; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

@c = external global i32*, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
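
; Every test in this file models the same scalar loop at a different element
; width, element type, and extension kind. A minimal C sketch of the pattern
; (the names and the scalar formulation are illustrative only, not part of
; the test):
;
;   #include <stdint.h>
;   extern int32_t *c;
;   void mul_2xi8_model(const uint8_t *a, const uint8_t *b, int64_t i) {
;     c[i + 0] = (uint32_t)a[i + 0] * (uint32_t)b[i + 0];
;     c[i + 1] = (uint32_t)a[i + 1] * (uint32_t)b[i + 1];
;   }
;
; Note that there is no 2-byte vector load, so the <2 x i8> operands above
; are loaded with a scalar movzwl and transferred to a vector register with
; movd.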

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi8:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    movdqa %xmm0, %xmm3
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; CHECK-NEXT:    movdqa %xmm1, %xmm4
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT:    pmullw %xmm3, %xmm4
; CHECK-NEXT:    movdqa %xmm4, %xmm3
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; CHECK-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; CHECK-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}
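
; For the zero-extended i8 tests above, the CHECK lines verify that the
; <N x i32> multiply is shrunk to 16-bit operations: punpcklbw against a
; zeroed register zero-extends the bytes to words, a single pmullw computes
; the products (exact, since a u8*u8 product is at most 255*255 and fits in
; 16 bits), and punpcklwd/punpckhwd against zero widen the products to i32.
; The 16-element case additionally feeds the high eight bytes through
; punpckhbw and a second pmullw.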

; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_4xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_8xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    movdqa %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT:    movdqa %xmm2, %xmm4
; CHECK-NEXT:    pmulhuw %xmm0, %xmm4
; CHECK-NEXT:    pmullw %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm2, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT:    movdqa %xmm3, %xmm4
; CHECK-NEXT:    pmulhuw %xmm1, %xmm4
; CHECK-NEXT:    pmullw %xmm1, %xmm3
; CHECK-NEXT:    movdqa %xmm3, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}
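
; For the zero-extended i16 tests, a u16*u16 product needs a full 32 bits,
; so it is assembled from two halves: pmullw produces the low 16 bits and
; pmulhuw the unsigned high 16 bits, and punpcklwd/punpckhwd interleave the
; two into i32 lanes.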

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm1
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i8>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi8_sext_zext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
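
; The sign-extended i8 variants sign-extend with punpcklbw (duplicating each
; byte) followed by psraw $8. In the sext*sext case the i8*i8 product fits
; in 16 bits, so the pmullw result is simply sign-extended to i32 via
; punpcklwd + psrad $16. The mixed sext*zext case instead rebuilds the
; 32-bit product from a pmullw/pmulhw (signed-high) pair.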

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = sext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmulhw %xmm0, %xmm2
; CHECK-NEXT:    pmullw %xmm0, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = sext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_2xi16_sext_zext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm2, %xmm2
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; CHECK-NEXT:    movdqa %xmm1, %xmm2
; CHECK-NEXT:    pmuludq %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm0, %xmm3
; CHECK-NEXT:    psrlq $32, %xmm3
; CHECK-NEXT:    pmuludq %xmm1, %xmm3
; CHECK-NEXT:    psllq $32, %xmm3
; CHECK-NEXT:    paddq %xmm2, %xmm3
; CHECK-NEXT:    psrlq $32, %xmm1
; CHECK-NEXT:    pmuludq %xmm0, %xmm1
; CHECK-NEXT:    psllq $32, %xmm1
; CHECK-NEXT:    paddq %xmm3, %xmm1
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i16>
; %op1 = sext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = sext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; CHECK-LABEL: mul_16xi16_sext:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
; CHECK-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm2
; CHECK-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
; CHECK-NEXT:    movdqa %xmm2, %xmm4
; CHECK-NEXT:    pmulhw %xmm0, %xmm4
; CHECK-NEXT:    pmullw %xmm0, %xmm2
; CHECK-NEXT:    movdqa %xmm2, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; CHECK-NEXT:    movdqa %xmm3, %xmm4
; CHECK-NEXT:    pmulhw %xmm1, %xmm4
; CHECK-NEXT:    pmullw %xmm1, %xmm3
; CHECK-NEXT:    movdqa %xmm3, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; CHECK-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}
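
; For i16, the sext*sext case mirrors the zext tests with pmulhw in place of
; pmulhuw. The mixed sext*zext i16 case is not shrunk at all: it falls back
; to the generic pmuludq/psrlq/psllq/paddq sequence that performs the 32-bit
; multiply in 64-bit lanes.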

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst1:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst2:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    pmullw {{.*}}(%rip), %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst3:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst4:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst5:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst6:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
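
; The varconst tests exercise how the range of a constant operand affects
; the shrink. Constants that fit the loaded element's extended range
; ({0,255} for zext, {-128,127} for sext) allow a single pmullw
; (varconst1/varconst2); once any constant steps outside that range
; ({0,256}, {-1,255}, {-129,127}, {-128,128}), a pmullw/pmulhw pair is
; emitted to produce the high half as well (varconst3-varconst6).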

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst1:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhuw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst2:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst3:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movl $65536, %ecx # imm = 0x10000
; CHECK-NEXT:    movd %rcx, %xmm1
; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmuludq %xmm1, %xmm2
; CHECK-NEXT:    psrlq $32, %xmm0
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    psllq $32, %xmm0
; CHECK-NEXT:    paddq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst4:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT:    movd %rcx, %xmm1
; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmuludq %xmm1, %xmm2
; CHECK-NEXT:    psrlq $32, %xmm0
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    psllq $32, %xmm0
; CHECK-NEXT:    paddq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
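
; The i16 varconst tests show the same boundaries one level up: {0,65535}
; with zext and {-32768,32767} with sext still shrink to a pmullw plus
; pmulhuw/pmulhw pair, while a constant that does not fit in 16 bits
; ({0,65536}, or {0,32768} against a sign-extended operand) defeats the
; shrink entirely and lowers to the pmuludq-based 32-bit multiply.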