; LLVM 'llc' regression test for the X86 backend.
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
      3 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
      4 
      5 ; Tests for widening/legalization of illegal-width vector loads and stores (based on PR5626).
      6 ;
      7 
      8 %i32vec3 = type <3 x i32>
      9 define void @add3i32(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
; <3 x i32>, 16-byte aligned: the loads may be widened to a full movdqa (the
; alignment guarantees the extra bytes are readable), but the 12-byte store is
; split element-wise so nothing past the vector is written.
; NOTE: the CHECK lines below are autogenerated — regenerate, do not hand-edit.
     10 ; X86-LABEL: add3i32:
     11 ; X86:       # %bb.0:
     12 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
     13 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
     14 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
     15 ; X86-NEXT:    movdqa (%edx), %xmm0
     16 ; X86-NEXT:    paddd (%ecx), %xmm0
     17 ; X86-NEXT:    pextrd $2, %xmm0, 8(%eax)
     18 ; X86-NEXT:    pextrd $1, %xmm0, 4(%eax)
     19 ; X86-NEXT:    movd %xmm0, (%eax)
     20 ; X86-NEXT:    retl $4
     21 ;
     22 ; X64-LABEL: add3i32:
     23 ; X64:       # %bb.0:
     24 ; X64-NEXT:    movdqa (%rsi), %xmm0
     25 ; X64-NEXT:    paddd (%rdx), %xmm0
     26 ; X64-NEXT:    pextrd $2, %xmm0, 8(%rdi)
     27 ; X64-NEXT:    movq %xmm0, (%rdi)
     28 ; X64-NEXT:    movq %rdi, %rax
     29 ; X64-NEXT:    retq
     30 	%a = load %i32vec3, %i32vec3* %ap, align 16
     31 	%b = load %i32vec3, %i32vec3* %bp, align 16
     32 	%x = add %i32vec3 %a, %b
     33 	store %i32vec3 %x, %i32vec3* %ret, align 16
     34 	ret void
     35 }
     36 
     37 define void @add3i32_2(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
; Same as add3i32 but only 8-byte aligned: a full 16-byte movdqa load would be
; unsafe, so the vector is assembled piecewise (movd/movq + pinsrd) and the
; 12-byte store is likewise split.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
     38 ; X86-LABEL: add3i32_2:
     39 ; X86:       # %bb.0:
     40 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
     41 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
     42 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
     43 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
     44 ; X86-NEXT:    pinsrd $1, 4(%edx), %xmm0
     45 ; X86-NEXT:    pinsrd $2, 8(%edx), %xmm0
     46 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
     47 ; X86-NEXT:    pinsrd $1, 4(%ecx), %xmm1
     48 ; X86-NEXT:    pinsrd $2, 8(%ecx), %xmm1
     49 ; X86-NEXT:    paddd %xmm0, %xmm1
     50 ; X86-NEXT:    pextrd $2, %xmm1, 8(%eax)
     51 ; X86-NEXT:    pextrd $1, %xmm1, 4(%eax)
     52 ; X86-NEXT:    movd %xmm1, (%eax)
     53 ; X86-NEXT:    retl $4
     54 ;
     55 ; X64-LABEL: add3i32_2:
     56 ; X64:       # %bb.0:
     57 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
     58 ; X64-NEXT:    pinsrd $2, 8(%rsi), %xmm0
     59 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
     60 ; X64-NEXT:    pinsrd $2, 8(%rdx), %xmm1
     61 ; X64-NEXT:    paddd %xmm0, %xmm1
     62 ; X64-NEXT:    pextrd $2, %xmm1, 8(%rdi)
     63 ; X64-NEXT:    movq %xmm1, (%rdi)
     64 ; X64-NEXT:    movq %rdi, %rax
     65 ; X64-NEXT:    retq
     66 	%a = load %i32vec3, %i32vec3* %ap, align 8
     67 	%b = load %i32vec3, %i32vec3* %bp, align 8
     68 	%x = add %i32vec3 %a, %b
     69 	store %i32vec3 %x, %i32vec3* %ret, align 8
     70 	ret void
     71 }
     72 
     73 %i32vec7 = type <7 x i32>
     74 define void @add7i32(%i32vec7*  sret %ret, %i32vec7* %ap, %i32vec7* %bp)  {
; <7 x i32> (28 bytes): legalized as two xmm halves. The low 16 bytes are a
; full movdqa store; the upper 3 lanes are stored element-wise (pextrd/movd,
; or movq+pextrd on X64) to avoid writing the 8th lane.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
     75 ; X86-LABEL: add7i32:
     76 ; X86:       # %bb.0:
     77 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
     78 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
     79 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
     80 ; X86-NEXT:    movdqa (%edx), %xmm0
     81 ; X86-NEXT:    movdqa 16(%edx), %xmm1
     82 ; X86-NEXT:    paddd (%ecx), %xmm0
     83 ; X86-NEXT:    paddd 16(%ecx), %xmm1
     84 ; X86-NEXT:    pextrd $2, %xmm1, 24(%eax)
     85 ; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
     86 ; X86-NEXT:    movd %xmm1, 16(%eax)
     87 ; X86-NEXT:    movdqa %xmm0, (%eax)
     88 ; X86-NEXT:    retl $4
     89 ;
     90 ; X64-LABEL: add7i32:
     91 ; X64:       # %bb.0:
     92 ; X64-NEXT:    movdqa (%rsi), %xmm0
     93 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
     94 ; X64-NEXT:    paddd (%rdx), %xmm0
     95 ; X64-NEXT:    paddd 16(%rdx), %xmm1
     96 ; X64-NEXT:    pextrd $2, %xmm1, 24(%rdi)
     97 ; X64-NEXT:    movq %xmm1, 16(%rdi)
     98 ; X64-NEXT:    movdqa %xmm0, (%rdi)
     99 ; X64-NEXT:    movq %rdi, %rax
    100 ; X64-NEXT:    retq
    101 	%a = load %i32vec7, %i32vec7* %ap, align 16
    102 	%b = load %i32vec7, %i32vec7* %bp, align 16
    103 	%x = add %i32vec7 %a, %b
    104 	store %i32vec7 %x, %i32vec7* %ret, align 16
    105 	ret void
    106 }
    107 
    108 %i32vec12 = type <12 x i32>
    109 define void @add12i32(%i32vec12*  sret %ret, %i32vec12* %ap, %i32vec12* %bp)  {
; <12 x i32> (48 bytes) is an exact multiple of the xmm width: three full
; movdqa load/add/store triples, no element-wise tail needed.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
    110 ; X86-LABEL: add12i32:
    111 ; X86:       # %bb.0:
    112 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    113 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    114 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
    115 ; X86-NEXT:    movdqa 32(%edx), %xmm0
    116 ; X86-NEXT:    movdqa (%edx), %xmm1
    117 ; X86-NEXT:    movdqa 16(%edx), %xmm2
    118 ; X86-NEXT:    paddd (%ecx), %xmm1
    119 ; X86-NEXT:    paddd 16(%ecx), %xmm2
    120 ; X86-NEXT:    paddd 32(%ecx), %xmm0
    121 ; X86-NEXT:    movdqa %xmm0, 32(%eax)
    122 ; X86-NEXT:    movdqa %xmm2, 16(%eax)
    123 ; X86-NEXT:    movdqa %xmm1, (%eax)
    124 ; X86-NEXT:    retl $4
    125 ;
    126 ; X64-LABEL: add12i32:
    127 ; X64:       # %bb.0:
    128 ; X64-NEXT:    movdqa (%rsi), %xmm0
    129 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
    130 ; X64-NEXT:    movdqa 32(%rsi), %xmm2
    131 ; X64-NEXT:    paddd (%rdx), %xmm0
    132 ; X64-NEXT:    paddd 16(%rdx), %xmm1
    133 ; X64-NEXT:    paddd 32(%rdx), %xmm2
    134 ; X64-NEXT:    movdqa %xmm2, 32(%rdi)
    135 ; X64-NEXT:    movdqa %xmm1, 16(%rdi)
    136 ; X64-NEXT:    movdqa %xmm0, (%rdi)
    137 ; X64-NEXT:    movq %rdi, %rax
    138 ; X64-NEXT:    retq
    139 	%a = load %i32vec12, %i32vec12* %ap, align 16
    140 	%b = load %i32vec12, %i32vec12* %bp, align 16
    141 	%x = add %i32vec12 %a, %b
    142 	store %i32vec12 %x, %i32vec12* %ret, align 16
    143 	ret void
    144 }
    145 
    146 
    147 %i16vec3 = type <3 x i16>
    148 define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; <3 x i16>: elements are zero-extended to 32-bit lanes (pmovzxwd), added with
; paddd, then repacked via pshufb. The 6-byte store is split into a 4-byte movd
; plus a 2-byte pextrw. The X86 path sets up an aligned stack frame
; (push %ebp / andl $-8) for spill slots.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
    149 ; X86-LABEL: add3i16:
    150 ; X86:       # %bb.0:
    151 ; X86-NEXT:    pushl %ebp
    152 ; X86-NEXT:    movl %esp, %ebp
    153 ; X86-NEXT:    andl $-8, %esp
    154 ; X86-NEXT:    subl $24, %esp
    155 ; X86-NEXT:    movl 8(%ebp), %eax
    156 ; X86-NEXT:    movl 16(%ebp), %ecx
    157 ; X86-NEXT:    movl 12(%ebp), %edx
    158 ; X86-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    159 ; X86-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    160 ; X86-NEXT:    pinsrd $2, 4(%edx), %xmm0
    161 ; X86-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    162 ; X86-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    163 ; X86-NEXT:    pinsrd $2, 4(%ecx), %xmm1
    164 ; X86-NEXT:    paddd %xmm0, %xmm1
    165 ; X86-NEXT:    pextrw $4, %xmm1, 4(%eax)
    166 ; X86-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    167 ; X86-NEXT:    movd %xmm1, (%eax)
    168 ; X86-NEXT:    movl %ebp, %esp
    169 ; X86-NEXT:    popl %ebp
    170 ; X86-NEXT:    retl $4
    171 ;
    172 ; X64-LABEL: add3i16:
    173 ; X64:       # %bb.0:
    174 ; X64-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
    175 ; X64-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
    176 ; X64-NEXT:    paddd %xmm0, %xmm1
    177 ; X64-NEXT:    pextrw $4, %xmm1, 4(%rdi)
    178 ; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    179 ; X64-NEXT:    movd %xmm1, (%rdi)
    180 ; X64-NEXT:    movq %rdi, %rax
    181 ; X64-NEXT:    retq
    182 	%a = load %i16vec3, %i16vec3* %ap, align 16
    183 	%b = load %i16vec3, %i16vec3* %bp, align 16
    184 	%x = add %i16vec3 %a, %b
    185 	store %i16vec3 %x, %i16vec3* %ret, align 16
    186 	ret void
    187 }
    188 
    189 %i16vec4 = type <4 x i16>
    190 define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; <4 x i16> is exactly 8 bytes: simple movq load / paddw / movq store, with no
; widening tail required.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
    191 ; X86-LABEL: add4i16:
    192 ; X86:       # %bb.0:
    193 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    194 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    195 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
    196 ; X86-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    197 ; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
    198 ; X86-NEXT:    paddw %xmm0, %xmm1
    199 ; X86-NEXT:    movq %xmm1, (%eax)
    200 ; X86-NEXT:    retl $4
    201 ;
    202 ; X64-LABEL: add4i16:
    203 ; X64:       # %bb.0:
    204 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    205 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
    206 ; X64-NEXT:    paddw %xmm0, %xmm1
    207 ; X64-NEXT:    movq %xmm1, (%rdi)
    208 ; X64-NEXT:    movq %rdi, %rax
    209 ; X64-NEXT:    retq
    210 	%a = load %i16vec4, %i16vec4* %ap, align 16
    211 	%b = load %i16vec4, %i16vec4* %bp, align 16
    212 	%x = add %i16vec4 %a, %b
    213 	store %i16vec4 %x, %i16vec4* %ret, align 16
    214 	ret void
    215 }
    216 
    217 %i16vec12 = type <12 x i16>
    218 define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; <12 x i16> (24 bytes): one full xmm plus an 8-byte tail. The tail is stored
; as movd + pextrd on X86 and a single movq on X64.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
    219 ; X86-LABEL: add12i16:
    220 ; X86:       # %bb.0:
    221 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    222 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    223 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
    224 ; X86-NEXT:    movdqa (%edx), %xmm0
    225 ; X86-NEXT:    movdqa 16(%edx), %xmm1
    226 ; X86-NEXT:    paddw (%ecx), %xmm0
    227 ; X86-NEXT:    paddw 16(%ecx), %xmm1
    228 ; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
    229 ; X86-NEXT:    movd %xmm1, 16(%eax)
    230 ; X86-NEXT:    movdqa %xmm0, (%eax)
    231 ; X86-NEXT:    retl $4
    232 ;
    233 ; X64-LABEL: add12i16:
    234 ; X64:       # %bb.0:
    235 ; X64-NEXT:    movdqa (%rsi), %xmm0
    236 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
    237 ; X64-NEXT:    paddw (%rdx), %xmm0
    238 ; X64-NEXT:    paddw 16(%rdx), %xmm1
    239 ; X64-NEXT:    movq %xmm1, 16(%rdi)
    240 ; X64-NEXT:    movdqa %xmm0, (%rdi)
    241 ; X64-NEXT:    movq %rdi, %rax
    242 ; X64-NEXT:    retq
    243 	%a = load %i16vec12, %i16vec12* %ap, align 16
    244 	%b = load %i16vec12, %i16vec12* %bp, align 16
    245 	%x = add %i16vec12 %a, %b
    246 	store %i16vec12 %x, %i16vec12* %ret, align 16
    247 	ret void
    248 }
    249 
    250 %i16vec18 = type <18 x i16>
    251 define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; <18 x i16> (36 bytes): two full xmm stores plus a 4-byte movd for the last
; two lanes.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
    252 ; X86-LABEL: add18i16:
    253 ; X86:       # %bb.0:
    254 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    255 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    256 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
    257 ; X86-NEXT:    movdqa 32(%edx), %xmm0
    258 ; X86-NEXT:    movdqa (%edx), %xmm1
    259 ; X86-NEXT:    movdqa 16(%edx), %xmm2
    260 ; X86-NEXT:    paddw (%ecx), %xmm1
    261 ; X86-NEXT:    paddw 16(%ecx), %xmm2
    262 ; X86-NEXT:    paddw 32(%ecx), %xmm0
    263 ; X86-NEXT:    movd %xmm0, 32(%eax)
    264 ; X86-NEXT:    movdqa %xmm2, 16(%eax)
    265 ; X86-NEXT:    movdqa %xmm1, (%eax)
    266 ; X86-NEXT:    retl $4
    267 ;
    268 ; X64-LABEL: add18i16:
    269 ; X64:       # %bb.0:
    270 ; X64-NEXT:    movdqa (%rsi), %xmm0
    271 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
    272 ; X64-NEXT:    movdqa 32(%rsi), %xmm2
    273 ; X64-NEXT:    paddw (%rdx), %xmm0
    274 ; X64-NEXT:    paddw 16(%rdx), %xmm1
    275 ; X64-NEXT:    paddw 32(%rdx), %xmm2
    276 ; X64-NEXT:    movd %xmm2, 32(%rdi)
    277 ; X64-NEXT:    movdqa %xmm1, 16(%rdi)
    278 ; X64-NEXT:    movdqa %xmm0, (%rdi)
    279 ; X64-NEXT:    movq %rdi, %rax
    280 ; X64-NEXT:    retq
    281 	%a = load %i16vec18, %i16vec18* %ap, align 16
    282 	%b = load %i16vec18, %i16vec18* %bp, align 16
    283 	%x = add %i16vec18 %a, %b
    284 	store %i16vec18 %x, %i16vec18* %ret, align 16
    285 	ret void
    286 }
    287 
    288 
    289 %i8vec3 = type <3 x i8>
    290 define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; <3 x i8>: bytes are zero-extended to 32-bit lanes (pmovzxbd), added with
; paddd, repacked via pshufb. The 3-byte store is split into a 2-byte pextrw
; plus a 1-byte pextrb so only 3 bytes are written.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
    291 ; X86-LABEL: add3i8:
    292 ; X86:       # %bb.0:
    293 ; X86-NEXT:    subl $12, %esp
    294 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    295 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    296 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
    297 ; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    298 ; X86-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    299 ; X86-NEXT:    paddd %xmm0, %xmm1
    300 ; X86-NEXT:    pextrb $8, %xmm1, 2(%eax)
    301 ; X86-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
    302 ; X86-NEXT:    pextrw $0, %xmm1, (%eax)
    303 ; X86-NEXT:    addl $12, %esp
    304 ; X86-NEXT:    retl $4
    305 ;
    306 ; X64-LABEL: add3i8:
    307 ; X64:       # %bb.0:
    308 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    309 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    310 ; X64-NEXT:    paddd %xmm0, %xmm1
    311 ; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
    312 ; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
    313 ; X64-NEXT:    pextrw $0, %xmm1, (%rdi)
    314 ; X64-NEXT:    movq %rdi, %rax
    315 ; X64-NEXT:    retq
    316 	%a = load %i8vec3, %i8vec3* %ap, align 16
    317 	%b = load %i8vec3, %i8vec3* %bp, align 16
    318 	%x = add %i8vec3 %a, %b
    319 	store %i8vec3 %x, %i8vec3* %ret, align 16
    320 	ret void
    321 }
    322 
    323 %i8vec31 = type <31 x i8>
    324 define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; <31 x i8> (31 bytes): one full xmm store plus a 15-byte tail decomposed into
; progressively smaller pieces (movd/movq, pextrd, pextrw, pextrb) so the 32nd
; byte is never written.
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
    325 ; X86-LABEL: add31i8:
    326 ; X86:       # %bb.0:
    327 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    328 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    329 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
    330 ; X86-NEXT:    movdqa (%edx), %xmm0
    331 ; X86-NEXT:    movdqa 16(%edx), %xmm1
    332 ; X86-NEXT:    paddb (%ecx), %xmm0
    333 ; X86-NEXT:    paddb 16(%ecx), %xmm1
    334 ; X86-NEXT:    pextrb $14, %xmm1, 30(%eax)
    335 ; X86-NEXT:    pextrw $6, %xmm1, 28(%eax)
    336 ; X86-NEXT:    pextrd $2, %xmm1, 24(%eax)
    337 ; X86-NEXT:    pextrd $1, %xmm1, 20(%eax)
    338 ; X86-NEXT:    movd %xmm1, 16(%eax)
    339 ; X86-NEXT:    movdqa %xmm0, (%eax)
    340 ; X86-NEXT:    retl $4
    341 ;
    342 ; X64-LABEL: add31i8:
    343 ; X64:       # %bb.0:
    344 ; X64-NEXT:    movdqa (%rsi), %xmm0
    345 ; X64-NEXT:    movdqa 16(%rsi), %xmm1
    346 ; X64-NEXT:    paddb (%rdx), %xmm0
    347 ; X64-NEXT:    paddb 16(%rdx), %xmm1
    348 ; X64-NEXT:    pextrb $14, %xmm1, 30(%rdi)
    349 ; X64-NEXT:    pextrw $6, %xmm1, 28(%rdi)
    350 ; X64-NEXT:    pextrd $2, %xmm1, 24(%rdi)
    351 ; X64-NEXT:    movq %xmm1, 16(%rdi)
    352 ; X64-NEXT:    movdqa %xmm0, (%rdi)
    353 ; X64-NEXT:    movq %rdi, %rax
    354 ; X64-NEXT:    retq
    355 	%a = load %i8vec31, %i8vec31* %ap, align 16
    356 	%b = load %i8vec31, %i8vec31* %bp, align 16
    357 	%x = add %i8vec31 %a, %b
    358 	store %i8vec31 %x, %i8vec31* %ret, align 16
    359 	ret void
    360 }
    361 
    362 
    363 %i8vec3pack = type { <3 x i8>, i8 }
    364 define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; <3 x i8> inside a packed struct: stores known constants (0x9E bytes into %X,
; 1s into %rot), then performs a vector lshr. The backend folds the constant
; shift amount into a single psrld $1 and stores the 3-byte result piecewise
; (pextrw + pextrb).
; NOTE: CHECK lines are autogenerated — regenerate, do not hand-edit.
    365 ; X86-LABEL: rot:
    366 ; X86:       # %bb.0: # %entry
    367 ; X86-NEXT:    subl $16, %esp
    368 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    369 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    370 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
    371 ; X86-NEXT:    movb $-98, 2(%edx)
    372 ; X86-NEXT:    movw $-24930, (%edx) # imm = 0x9E9E
    373 ; X86-NEXT:    movb $1, 2(%ecx)
    374 ; X86-NEXT:    movw $257, (%ecx) # imm = 0x101
    375 ; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    376 ; X86-NEXT:    movdqa %xmm0, %xmm1
    377 ; X86-NEXT:    psrld $1, %xmm1
    378 ; X86-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
    379 ; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
    380 ; X86-NEXT:    pextrb $8, %xmm1, 2(%eax)
    381 ; X86-NEXT:    pextrw $0, %xmm0, (%eax)
    382 ; X86-NEXT:    addl $16, %esp
    383 ; X86-NEXT:    retl $4
    384 ;
    385 ; X64-LABEL: rot:
    386 ; X64:       # %bb.0: # %entry
    387 ; X64-NEXT:    movb $-98, 2(%rsi)
    388 ; X64-NEXT:    movw $-24930, (%rsi) # imm = 0x9E9E
    389 ; X64-NEXT:    movb $1, 2(%rdx)
    390 ; X64-NEXT:    movw $257, (%rdx) # imm = 0x101
    391 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    392 ; X64-NEXT:    movdqa %xmm0, %xmm1
    393 ; X64-NEXT:    psrld $1, %xmm1
    394 ; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
    395 ; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
    396 ; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
    397 ; X64-NEXT:    pextrw $0, %xmm0, (%rdi)
    398 ; X64-NEXT:    movq %rdi, %rax
    399 ; X64-NEXT:    retq
    400 entry:
; Write constant test data through bitcasts of the packed-struct pointers.
    401   %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
    402   store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
    403   %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
    404   store <3 x i8> <i8 1, i8 1, i8 1>, <3 x i8>* %storetmp1
; Reload, extract the <3 x i8> fields, shift, and store the result.
    405   %tmp = load %i8vec3pack, %i8vec3pack* %X
    406   %extractVec = extractvalue %i8vec3pack %tmp, 0
    407   %tmp2 = load %i8vec3pack, %i8vec3pack* %rot
    408   %extractVec3 = extractvalue %i8vec3pack %tmp2, 0
    409   %shr = lshr <3 x i8> %extractVec, %extractVec3
    410   %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
    411   store <3 x i8> %shr, <3 x i8>* %storetmp4
    412   ret void
    413 }
    414 
    415