; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64

; Test based on pr5626 to load/store
;

%i32vec3 = type <3 x i32>
define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-LABEL: add3i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa (%edx), %xmm0
; X86-NEXT:    paddd (%ecx), %xmm0
; X86-NEXT:    pextrd $2, %xmm0, 8(%eax)
; X86-NEXT:    pextrd $1, %xmm0, 4(%eax)
; X86-NEXT:    movd %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add3i32:
; X64:       # %bb.0:
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    paddd (%rdx), %xmm0
; X64-NEXT:    pextrd $2, %xmm0, 8(%rdi)
; X64-NEXT:    movq %xmm0, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i32vec3, %i32vec3* %ap, align 16
  %b = load %i32vec3, %i32vec3* %bp, align 16
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 16
  ret void
}

define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-LABEL: add3i32_2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    pinsrd $1, 4(%edx), %xmm0
; X86-NEXT:    pinsrd $2, 8(%edx), %xmm0
; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    pinsrd $1, 4(%ecx), %xmm1
; X86-NEXT:    pinsrd $2, 8(%ecx), %xmm1
; X86-NEXT:    paddd %xmm0, %xmm1
; X86-NEXT:    pextrd $2, %xmm1, 8(%eax)
; X86-NEXT:    pextrd $1, %xmm1, 4(%eax)
; X86-NEXT:    movd %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add3i32_2:
; X64:       # %bb.0:
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    pinsrd $2, 8(%rsi), %xmm0
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    pinsrd $2, 8(%rdx), %xmm1
; X64-NEXT:    paddd %xmm0, %xmm1
; X64-NEXT:    pextrd $2, %xmm1, 8(%rdi)
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i32vec3, %i32vec3* %ap, align 8
  %b = load %i32vec3, %i32vec3* %bp, align 8
  %x = add %i32vec3 %a, %b
  store %i32vec3 %x, %i32vec3* %ret, align 8
  ret void
}

%i32vec7 = type <7 x i32>
define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; X86-LABEL: add7i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa (%edx), %xmm0
; X86-NEXT:    movdqa 16(%edx), %xmm1
; X86-NEXT:    paddd (%ecx), %xmm0
; X86-NEXT:    paddd 16(%ecx), %xmm1
; X86-NEXT:    pextrd $2, %xmm1, 24(%eax)
; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
; X86-NEXT:    movd %xmm1, 16(%eax)
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add7i32:
; X64:       # %bb.0:
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    paddd (%rdx), %xmm0
; X64-NEXT:    paddd 16(%rdx), %xmm1
; X64-NEXT:    pextrd $2, %xmm1, 24(%rdi)
; X64-NEXT:    movq %xmm1, 16(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i32vec7, %i32vec7* %ap, align 16
  %b = load %i32vec7, %i32vec7* %bp, align 16
  %x = add %i32vec7 %a, %b
  store %i32vec7 %x, %i32vec7* %ret, align 16
  ret void
}

%i32vec12 = type <12 x i32>
define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; X86-LABEL: add12i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa 32(%edx), %xmm0
; X86-NEXT:    movdqa (%edx), %xmm1
; X86-NEXT:    movdqa 16(%edx), %xmm2
; X86-NEXT:    paddd (%ecx), %xmm1
; X86-NEXT:    paddd 16(%ecx), %xmm2
; X86-NEXT:    paddd 32(%ecx), %xmm0
; X86-NEXT:    movdqa %xmm0, 32(%eax)
; X86-NEXT:    movdqa %xmm2, 16(%eax)
; X86-NEXT:    movdqa %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add12i32:
; X64:       # %bb.0:
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    movdqa 32(%rsi), %xmm2
; X64-NEXT:    paddd (%rdx), %xmm0
; X64-NEXT:    paddd 16(%rdx), %xmm1
; X64-NEXT:    paddd 32(%rdx), %xmm2
; X64-NEXT:    movdqa %xmm2, 32(%rdi)
; X64-NEXT:    movdqa %xmm1, 16(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i32vec12, %i32vec12* %ap, align 16
  %b = load %i32vec12, %i32vec12* %bp, align 16
  %x = add %i32vec12 %a, %b
  store %i32vec12 %x, %i32vec12* %ret, align 16
  ret void
}


%i16vec3 = type <3 x i16>
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; X86-LABEL: add3i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $24, %esp
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movl 16(%ebp), %ecx
; X86-NEXT:    movl 12(%ebp), %edx
; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-NEXT:    pinsrd $2, 4(%edx), %xmm0
; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-NEXT:    pinsrd $2, 4(%ecx), %xmm1
; X86-NEXT:    paddd %xmm0, %xmm1
; X86-NEXT:    pextrw $4, %xmm1, 4(%eax)
; X86-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X86-NEXT:    movd %xmm1, (%eax)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl $4
;
; X64-LABEL: add3i16:
; X64:       # %bb.0:
; X64-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-NEXT:    paddd %xmm0, %xmm1
; X64-NEXT:    pextrw $4, %xmm1, 4(%rdi)
; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X64-NEXT:    movd %xmm1, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i16vec3, %i16vec3* %ap, align 16
  %b = load %i16vec3, %i16vec3* %bp, align 16
  %x = add %i16vec3 %a, %b
  store %i16vec3 %x, %i16vec3* %ret, align 16
  ret void
}

%i16vec4 = type <4 x i16>
define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; X86-LABEL: add4i16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    paddw %xmm0, %xmm1
; X86-NEXT:    movq %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add4i16:
; X64:       # %bb.0:
; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    paddw %xmm0, %xmm1
; X64-NEXT:    movq %xmm1, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i16vec4, %i16vec4* %ap, align 16
  %b = load %i16vec4, %i16vec4* %bp, align 16
  %x = add %i16vec4 %a, %b
  store %i16vec4 %x, %i16vec4* %ret, align 16
  ret void
}

%i16vec12 = type <12 x i16>
define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; X86-LABEL: add12i16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa (%edx), %xmm0
; X86-NEXT:    movdqa 16(%edx), %xmm1
; X86-NEXT:    paddw (%ecx), %xmm0
; X86-NEXT:    paddw 16(%ecx), %xmm1
; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
; X86-NEXT:    movd %xmm1, 16(%eax)
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add12i16:
; X64:       # %bb.0:
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    paddw (%rdx), %xmm0
; X64-NEXT:    paddw 16(%rdx), %xmm1
; X64-NEXT:    movq %xmm1, 16(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i16vec12, %i16vec12* %ap, align 16
  %b = load %i16vec12, %i16vec12* %bp, align 16
  %x = add %i16vec12 %a, %b
  store %i16vec12 %x, %i16vec12* %ret, align 16
  ret void
}

%i16vec18 = type <18 x i16>
define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; X86-LABEL: add18i16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa 32(%edx), %xmm0
; X86-NEXT:    movdqa (%edx), %xmm1
; X86-NEXT:    movdqa 16(%edx), %xmm2
; X86-NEXT:    paddw (%ecx), %xmm1
; X86-NEXT:    paddw 16(%ecx), %xmm2
; X86-NEXT:    paddw 32(%ecx), %xmm0
; X86-NEXT:    movd %xmm0, 32(%eax)
; X86-NEXT:    movdqa %xmm2, 16(%eax)
; X86-NEXT:    movdqa %xmm1, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add18i16:
; X64:       # %bb.0:
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    movdqa 32(%rsi), %xmm2
; X64-NEXT:    paddw (%rdx), %xmm0
; X64-NEXT:    paddw 16(%rdx), %xmm1
; X64-NEXT:    paddw 32(%rdx), %xmm2
; X64-NEXT:    movd %xmm2, 32(%rdi)
; X64-NEXT:    movdqa %xmm1, 16(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i16vec18, %i16vec18* %ap, align 16
  %b = load %i16vec18, %i16vec18* %bp, align 16
  %x = add %i16vec18 %a, %b
  store %i16vec18 %x, %i16vec18* %ret, align 16
  ret void
}


%i8vec3 = type <3 x i8>
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; X86-LABEL: add3i8:
; X86:       # %bb.0:
; X86-NEXT:    subl $12, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-NEXT:    paddd %xmm0, %xmm1
; X86-NEXT:    pextrb $8, %xmm1, 2(%eax)
; X86-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT:    pextrw $0, %xmm1, (%eax)
; X86-NEXT:    addl $12, %esp
; X86-NEXT:    retl $4
;
; X64-LABEL: add3i8:
; X64:       # %bb.0:
; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-NEXT:    paddd %xmm0, %xmm1
; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT:    pextrw $0, %xmm1, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i8vec3, %i8vec3* %ap, align 16
  %b = load %i8vec3, %i8vec3* %bp, align 16
  %x = add %i8vec3 %a, %b
  store %i8vec3 %x, %i8vec3* %ret, align 16
  ret void
}

%i8vec31 = type <31 x i8>
define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; X86-LABEL: add31i8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movdqa (%edx), %xmm0
; X86-NEXT:    movdqa 16(%edx), %xmm1
; X86-NEXT:    paddb (%ecx), %xmm0
; X86-NEXT:    paddb 16(%ecx), %xmm1
; X86-NEXT:    pextrb $14, %xmm1, 30(%eax)
; X86-NEXT:    pextrw $6, %xmm1, 28(%eax)
; X86-NEXT:    pextrd $2, %xmm1, 24(%eax)
; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
; X86-NEXT:    movd %xmm1, 16(%eax)
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl $4
;
; X64-LABEL: add31i8:
; X64:       # %bb.0:
; X64-NEXT:    movdqa (%rsi), %xmm0
; X64-NEXT:    movdqa 16(%rsi), %xmm1
; X64-NEXT:    paddb (%rdx), %xmm0
; X64-NEXT:    paddb 16(%rdx), %xmm1
; X64-NEXT:    pextrb $14, %xmm1, 30(%rdi)
; X64-NEXT:    pextrw $6, %xmm1, 28(%rdi)
; X64-NEXT:    pextrd $2, %xmm1, 24(%rdi)
; X64-NEXT:    movq %xmm1, 16(%rdi)
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %a = load %i8vec31, %i8vec31* %ap, align 16
  %b = load %i8vec31, %i8vec31* %bp, align 16
  %x = add %i8vec31 %a, %b
  store %i8vec31 %x, %i8vec31* %ret, align 16
  ret void
}


%i8vec3pack = type { <3 x i8>, i8 }
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; X86-LABEL: rot:
; X86:       # %bb.0: # %entry
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movb $-98, 2(%edx)
; X86-NEXT:    movw $-24930, (%edx) # imm = 0x9E9E
; X86-NEXT:    movb $1, 2(%ecx)
; X86-NEXT:    movw $257, (%ecx) # imm = 0x101
; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    psrld $1, %xmm1
; X86-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT:    pextrb $8, %xmm1, 2(%eax)
; X86-NEXT:    pextrw $0, %xmm0, (%eax)
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    retl $4
;
; X64-LABEL: rot:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movb $-98, 2(%rsi)
; X64-NEXT:    movw $-24930, (%rsi) # imm = 0x9E9E
; X64-NEXT:    movb $1, 2(%rdx)
; X64-NEXT:    movw $257, (%rdx) # imm = 0x101
; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrld $1, %xmm1
; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
; X64-NEXT:    pextrw $0, %xmm0, (%rdi)
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
entry:
  %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
  store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
  %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
  store <3 x i8> <i8 1, i8 1, i8 1>, <3 x i8>* %storetmp1
  %tmp = load %i8vec3pack, %i8vec3pack* %X
  %extractVec = extractvalue %i8vec3pack %tmp, 0
  %tmp2 = load %i8vec3pack, %i8vec3pack* %rot
  %extractVec3 = extractvalue %i8vec3pack %tmp2, 0
  %shr = lshr <3 x i8> %extractVec, %extractVec3
  %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
  store <3 x i8> %shr, <3 x i8>* %storetmp4
  ret void
}