      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512F
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
     10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,VBMI
     11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
     12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI
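;
; Variable (run-time indexed) shuffles expressed as extractelement/insertelement
; chains. The autogenerated CHECK lines record the expected per-subtarget
; lowering, roughly: pre-SSSE3 targets spill the source vector to the stack and
; reload each element, SSSE3/SSE4.1 build a PSHUFB control mask, AVX targets use
; VPERMILPS/VPERMILPD/VPSHUFB, and targets with AVX512VL+BW or VL+VBMI can use
; VPERMW/VPERMB directly.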
     13 
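; v2i64: SSE3/SSSE3 spill %v and reload the two selected lanes; SSE4.1 instead
; selects between the two lane splats with PCMPEQQ+BLENDVPD; AVX doubles the
; index and uses VPERMILPD.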
     14 define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
     15 ; SSE3-LABEL: var_shuffle_v2i64:
     16 ; SSE3:       # %bb.0:
     17 ; SSE3-NEXT:    movq %xmm1, %rax
     18 ; SSE3-NEXT:    andl $1, %eax
     19 ; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
     20 ; SSE3-NEXT:    movq %xmm1, %rcx
     21 ; SSE3-NEXT:    andl $1, %ecx
     22 ; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
     23 ; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
     24 ; SSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
     25 ; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     26 ; SSE3-NEXT:    retq
     27 ;
     28 ; SSSE3-LABEL: var_shuffle_v2i64:
     29 ; SSSE3:       # %bb.0:
     30 ; SSSE3-NEXT:    movq %xmm1, %rax
     31 ; SSSE3-NEXT:    andl $1, %eax
     32 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
     33 ; SSSE3-NEXT:    movq %xmm1, %rcx
     34 ; SSSE3-NEXT:    andl $1, %ecx
     35 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
     36 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
     37 ; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
     38 ; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     39 ; SSSE3-NEXT:    retq
     40 ;
     41 ; SSE41-LABEL: var_shuffle_v2i64:
     42 ; SSE41:       # %bb.0:
     43 ; SSE41-NEXT:    pxor %xmm2, %xmm2
     44 ; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
     45 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
     46 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
     47 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
     48 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
     49 ; SSE41-NEXT:    movapd %xmm1, %xmm0
     50 ; SSE41-NEXT:    retq
     51 ;
     52 ; AVX-LABEL: var_shuffle_v2i64:
     53 ; AVX:       # %bb.0:
     54 ; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
     55 ; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
     56 ; AVX-NEXT:    retq
     57   %index0 = extractelement <2 x i64> %indices, i32 0
     58   %index1 = extractelement <2 x i64> %indices, i32 1
     59   %v0 = extractelement <2 x i64> %v, i64 %index0
     60   %v1 = extractelement <2 x i64> %v, i64 %index1
     61   %ret0 = insertelement <2 x i64> undef, i64 %v0, i32 0
     62   %ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
     63   ret <2 x i64> %ret1
     64 }
     65 
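; v4i32: SSE3 goes through the stack; SSSE3/SSE4.1 scale each index to a 4-byte
; offset (PMULUDQ/PMULLD by 0x04040404) and add a constant to form a PSHUFB
; control mask; AVX lowers to a single VPERMILPS.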
     66 define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
     67 ; SSE3-LABEL: var_shuffle_v4i32:
     68 ; SSE3:       # %bb.0:
     69 ; SSE3-NEXT:    movd %xmm1, %eax
     70 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
     71 ; SSE3-NEXT:    movd %xmm2, %ecx
     72 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
     73 ; SSE3-NEXT:    movd %xmm2, %edx
     74 ; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
     75 ; SSE3-NEXT:    movd %xmm1, %esi
     76 ; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
     77 ; SSE3-NEXT:    andl $3, %eax
     78 ; SSE3-NEXT:    andl $3, %ecx
     79 ; SSE3-NEXT:    andl $3, %edx
     80 ; SSE3-NEXT:    andl $3, %esi
     81 ; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
     82 ; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
     83 ; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
     84 ; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
     85 ; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
     86 ; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
     87 ; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     88 ; SSE3-NEXT:    retq
     89 ;
     90 ; SSSE3-LABEL: var_shuffle_v4i32:
     91 ; SSSE3:       # %bb.0:
     92 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
     93 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
     94 ; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
     95 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
     96 ; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
     97 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
     98 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
     99 ; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
    100 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    101 ; SSSE3-NEXT:    retq
    102 ;
    103 ; SSE41-LABEL: var_shuffle_v4i32:
    104 ; SSE41:       # %bb.0:
    105 ; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
    106 ; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
    107 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
    108 ; SSE41-NEXT:    retq
    109 ;
    110 ; AVX-LABEL: var_shuffle_v4i32:
    111 ; AVX:       # %bb.0:
    112 ; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
    113 ; AVX-NEXT:    retq
    114   %index0 = extractelement <4 x i32> %indices, i32 0
    115   %index1 = extractelement <4 x i32> %indices, i32 1
    116   %index2 = extractelement <4 x i32> %indices, i32 2
    117   %index3 = extractelement <4 x i32> %indices, i32 3
    118   %v0 = extractelement <4 x i32> %v, i32 %index0
    119   %v1 = extractelement <4 x i32> %v, i32 %index1
    120   %v2 = extractelement <4 x i32> %v, i32 %index2
    121   %v3 = extractelement <4 x i32> %v, i32 %index3
    122   %ret0 = insertelement <4 x i32> undef, i32 %v0, i32 0
    123   %ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
    124   %ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
    125   %ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
    126   ret <4 x i32> %ret3
    127 }
    128 
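; v8i16: SSE3 extracts each index with PEXTRW and reloads words from the stack;
; SSSE3/SSE4.1/AVX build a PSHUFB mask with PMULLW/PADDW; AVX512VL+BW selects
; VPERMW directly.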
    129 define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
    130 ; SSE3-LABEL: var_shuffle_v8i16:
    131 ; SSE3:       # %bb.0:
    132 ; SSE3-NEXT:    movd %xmm1, %r8d
    133 ; SSE3-NEXT:    pextrw $1, %xmm1, %r9d
    134 ; SSE3-NEXT:    pextrw $2, %xmm1, %r10d
    135 ; SSE3-NEXT:    pextrw $3, %xmm1, %esi
    136 ; SSE3-NEXT:    pextrw $4, %xmm1, %edi
    137 ; SSE3-NEXT:    pextrw $5, %xmm1, %eax
    138 ; SSE3-NEXT:    pextrw $6, %xmm1, %ecx
    139 ; SSE3-NEXT:    pextrw $7, %xmm1, %edx
    140 ; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
    141 ; SSE3-NEXT:    andl $7, %r8d
    142 ; SSE3-NEXT:    andl $7, %r9d
    143 ; SSE3-NEXT:    andl $7, %r10d
    144 ; SSE3-NEXT:    andl $7, %esi
    145 ; SSE3-NEXT:    andl $7, %edi
    146 ; SSE3-NEXT:    andl $7, %eax
    147 ; SSE3-NEXT:    andl $7, %ecx
    148 ; SSE3-NEXT:    andl $7, %edx
    149 ; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
    150 ; SSE3-NEXT:    movd %edx, %xmm0
    151 ; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
    152 ; SSE3-NEXT:    movd %ecx, %xmm1
    153 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
    154 ; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
    155 ; SSE3-NEXT:    movd %eax, %xmm0
    156 ; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
    157 ; SSE3-NEXT:    movd %eax, %xmm2
    158 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
    159 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
    160 ; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
    161 ; SSE3-NEXT:    movd %eax, %xmm0
    162 ; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %eax
    163 ; SSE3-NEXT:    movd %eax, %xmm1
    164 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
    165 ; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
    166 ; SSE3-NEXT:    movd %eax, %xmm3
    167 ; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
    168 ; SSE3-NEXT:    movd %eax, %xmm0
    169 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
    170 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    171 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
    172 ; SSE3-NEXT:    retq
    173 ;
    174 ; SSSE3-LABEL: var_shuffle_v8i16:
    175 ; SSSE3:       # %bb.0:
    176 ; SSSE3-NEXT:    pmullw {{.*}}(%rip), %xmm1
    177 ; SSSE3-NEXT:    paddw {{.*}}(%rip), %xmm1
    178 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    179 ; SSSE3-NEXT:    retq
    180 ;
    181 ; SSE41-LABEL: var_shuffle_v8i16:
    182 ; SSE41:       # %bb.0:
    183 ; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
    184 ; SSE41-NEXT:    paddw {{.*}}(%rip), %xmm1
    185 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
    186 ; SSE41-NEXT:    retq
    187 ;
    188 ; AVXNOVLBW-LABEL: var_shuffle_v8i16:
    189 ; AVXNOVLBW:       # %bb.0:
    190 ; AVXNOVLBW-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
    191 ; AVXNOVLBW-NEXT:    vpaddw {{.*}}(%rip), %xmm1, %xmm1
    192 ; AVXNOVLBW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    193 ; AVXNOVLBW-NEXT:    retq
    194 ;
    195 ; AVX512VL-LABEL: var_shuffle_v8i16:
    196 ; AVX512VL:       # %bb.0:
    197 ; AVX512VL-NEXT:    vpermw %xmm0, %xmm1, %xmm0
    198 ; AVX512VL-NEXT:    retq
    199   %index0 = extractelement <8 x i16> %indices, i32 0
    200   %index1 = extractelement <8 x i16> %indices, i32 1
    201   %index2 = extractelement <8 x i16> %indices, i32 2
    202   %index3 = extractelement <8 x i16> %indices, i32 3
    203   %index4 = extractelement <8 x i16> %indices, i32 4
    204   %index5 = extractelement <8 x i16> %indices, i32 5
    205   %index6 = extractelement <8 x i16> %indices, i32 6
    206   %index7 = extractelement <8 x i16> %indices, i32 7
    207   %v0 = extractelement <8 x i16> %v, i16 %index0
    208   %v1 = extractelement <8 x i16> %v, i16 %index1
    209   %v2 = extractelement <8 x i16> %v, i16 %index2
    210   %v3 = extractelement <8 x i16> %v, i16 %index3
    211   %v4 = extractelement <8 x i16> %v, i16 %index4
    212   %v5 = extractelement <8 x i16> %v, i16 %index5
    213   %v6 = extractelement <8 x i16> %v, i16 %index6
    214   %v7 = extractelement <8 x i16> %v, i16 %index7
    215   %ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0
    216   %ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1
    217   %ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2
    218   %ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3
    219   %ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4
    220   %ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5
    221   %ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6
    222   %ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7
    223   ret <8 x i16> %ret7
    224 }
    225 
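; v16i8: byte indices map directly onto PSHUFB, so SSSE3 and later need a single
; instruction; SSE3 falls back to the stack.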
    226 define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
    227 ; SSE3-LABEL: var_shuffle_v16i8:
    228 ; SSE3:       # %bb.0:
    229 ; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
    230 ; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
    231 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    232 ; SSE3-NEXT:    andl $15, %eax
    233 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    234 ; SSE3-NEXT:    movd %eax, %xmm8
    235 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    236 ; SSE3-NEXT:    andl $15, %eax
    237 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    238 ; SSE3-NEXT:    movd %eax, %xmm15
    239 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    240 ; SSE3-NEXT:    andl $15, %eax
    241 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    242 ; SSE3-NEXT:    movd %eax, %xmm9
    243 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    244 ; SSE3-NEXT:    andl $15, %eax
    245 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    246 ; SSE3-NEXT:    movd %eax, %xmm3
    247 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    248 ; SSE3-NEXT:    andl $15, %eax
    249 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    250 ; SSE3-NEXT:    movd %eax, %xmm10
    251 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    252 ; SSE3-NEXT:    andl $15, %eax
    253 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    254 ; SSE3-NEXT:    movd %eax, %xmm7
    255 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    256 ; SSE3-NEXT:    andl $15, %eax
    257 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    258 ; SSE3-NEXT:    movd %eax, %xmm11
    259 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    260 ; SSE3-NEXT:    andl $15, %eax
    261 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    262 ; SSE3-NEXT:    movd %eax, %xmm6
    263 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    264 ; SSE3-NEXT:    andl $15, %eax
    265 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    266 ; SSE3-NEXT:    movd %eax, %xmm12
    267 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    268 ; SSE3-NEXT:    andl $15, %eax
    269 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    270 ; SSE3-NEXT:    movd %eax, %xmm5
    271 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    272 ; SSE3-NEXT:    andl $15, %eax
    273 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    274 ; SSE3-NEXT:    movd %eax, %xmm13
    275 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    276 ; SSE3-NEXT:    andl $15, %eax
    277 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    278 ; SSE3-NEXT:    movd %eax, %xmm4
    279 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    280 ; SSE3-NEXT:    andl $15, %eax
    281 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    282 ; SSE3-NEXT:    movd %eax, %xmm14
    283 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    284 ; SSE3-NEXT:    andl $15, %eax
    285 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    286 ; SSE3-NEXT:    movd %eax, %xmm1
    287 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    288 ; SSE3-NEXT:    andl $15, %eax
    289 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    290 ; SSE3-NEXT:    movd %eax, %xmm2
    291 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    292 ; SSE3-NEXT:    andl $15, %eax
    293 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    294 ; SSE3-NEXT:    movd %eax, %xmm0
    295 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
    296 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
    297 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
    298 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
    299 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
    300 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
    301 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
    302 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
    303 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
    304 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
    305 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
    306 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    307 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    308 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    309 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
    310 ; SSE3-NEXT:    retq
    311 ;
    312 ; SSSE3-LABEL: var_shuffle_v16i8:
    313 ; SSSE3:       # %bb.0:
    314 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    315 ; SSSE3-NEXT:    retq
    316 ;
    317 ; SSE41-LABEL: var_shuffle_v16i8:
    318 ; SSE41:       # %bb.0:
    319 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
    320 ; SSE41-NEXT:    retq
    321 ;
    322 ; AVX-LABEL: var_shuffle_v16i8:
    323 ; AVX:       # %bb.0:
    324 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    325 ; AVX-NEXT:    retq
    326   %index0 = extractelement <16 x i8> %indices, i32 0
    327   %index1 = extractelement <16 x i8> %indices, i32 1
    328   %index2 = extractelement <16 x i8> %indices, i32 2
    329   %index3 = extractelement <16 x i8> %indices, i32 3
    330   %index4 = extractelement <16 x i8> %indices, i32 4
    331   %index5 = extractelement <16 x i8> %indices, i32 5
    332   %index6 = extractelement <16 x i8> %indices, i32 6
    333   %index7 = extractelement <16 x i8> %indices, i32 7
    334   %index8 = extractelement <16 x i8> %indices, i32 8
    335   %index9 = extractelement <16 x i8> %indices, i32 9
    336   %index10 = extractelement <16 x i8> %indices, i32 10
    337   %index11 = extractelement <16 x i8> %indices, i32 11
    338   %index12 = extractelement <16 x i8> %indices, i32 12
    339   %index13 = extractelement <16 x i8> %indices, i32 13
    340   %index14 = extractelement <16 x i8> %indices, i32 14
    341   %index15 = extractelement <16 x i8> %indices, i32 15
    342   %v0 = extractelement <16 x i8> %v, i8 %index0
    343   %v1 = extractelement <16 x i8> %v, i8 %index1
    344   %v2 = extractelement <16 x i8> %v, i8 %index2
    345   %v3 = extractelement <16 x i8> %v, i8 %index3
    346   %v4 = extractelement <16 x i8> %v, i8 %index4
    347   %v5 = extractelement <16 x i8> %v, i8 %index5
    348   %v6 = extractelement <16 x i8> %v, i8 %index6
    349   %v7 = extractelement <16 x i8> %v, i8 %index7
    350   %v8 = extractelement <16 x i8> %v, i8 %index8
    351   %v9 = extractelement <16 x i8> %v, i8 %index9
    352   %v10 = extractelement <16 x i8> %v, i8 %index10
    353   %v11 = extractelement <16 x i8> %v, i8 %index11
    354   %v12 = extractelement <16 x i8> %v, i8 %index12
    355   %v13 = extractelement <16 x i8> %v, i8 %index13
    356   %v14 = extractelement <16 x i8> %v, i8 %index14
    357   %v15 = extractelement <16 x i8> %v, i8 %index15
    358   %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
    359   %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
    360   %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
    361   %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
    362   %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
    363   %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
    364   %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
    365   %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
    366   %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
    367   %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
    368   %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
    369   %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
    370   %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
    371   %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
    372   %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
    373   %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
    374   ret <16 x i8> %ret15
    375 }
    376 
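; v2f64: same strategy as v2i64 above; SSE3/SSSE3 merge the two loads with
; MOVSD+MOVHPD, SSE4.1 blends with BLENDVPD, AVX uses VPERMILPD.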
    377 define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
    378 ; SSE3-LABEL: var_shuffle_v2f64:
    379 ; SSE3:       # %bb.0:
    380 ; SSE3-NEXT:    movq %xmm1, %rax
    381 ; SSE3-NEXT:    andl $1, %eax
    382 ; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
    383 ; SSE3-NEXT:    movq %xmm1, %rcx
    384 ; SSE3-NEXT:    andl $1, %ecx
    385 ; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
    386 ; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    387 ; SSE3-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
    388 ; SSE3-NEXT:    retq
    389 ;
    390 ; SSSE3-LABEL: var_shuffle_v2f64:
    391 ; SSSE3:       # %bb.0:
    392 ; SSSE3-NEXT:    movq %xmm1, %rax
    393 ; SSSE3-NEXT:    andl $1, %eax
    394 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
    395 ; SSSE3-NEXT:    movq %xmm1, %rcx
    396 ; SSSE3-NEXT:    andl $1, %ecx
    397 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
    398 ; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
    399 ; SSSE3-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
    400 ; SSSE3-NEXT:    retq
    401 ;
    402 ; SSE41-LABEL: var_shuffle_v2f64:
    403 ; SSE41:       # %bb.0:
    404 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
    405 ; SSE41-NEXT:    pxor %xmm0, %xmm0
    406 ; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
    407 ; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
    408 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
    409 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
    410 ; SSE41-NEXT:    movapd %xmm2, %xmm0
    411 ; SSE41-NEXT:    retq
    412 ;
    413 ; AVX-LABEL: var_shuffle_v2f64:
    414 ; AVX:       # %bb.0:
    415 ; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
    416 ; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
    417 ; AVX-NEXT:    retq
    418   %index0 = extractelement <2 x i64> %indices, i32 0
    419   %index1 = extractelement <2 x i64> %indices, i32 1
    420   %v0 = extractelement <2 x double> %v, i64 %index0
    421   %v1 = extractelement <2 x double> %v, i64 %index1
    422   %ret0 = insertelement <2 x double> undef, double %v0, i32 0
    423   %ret1 = insertelement <2 x double> %ret0, double %v1, i32 1
    424   ret <2 x double> %ret1
    425 }
    426 
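; v4f32: same strategy as v4i32 above; AVX lowers to a single VPERMILPS.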
    427 define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
    428 ; SSE3-LABEL: var_shuffle_v4f32:
    429 ; SSE3:       # %bb.0:
    430 ; SSE3-NEXT:    movd %xmm1, %eax
    431 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
    432 ; SSE3-NEXT:    movd %xmm2, %ecx
    433 ; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
    434 ; SSE3-NEXT:    movd %xmm2, %edx
    435 ; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
    436 ; SSE3-NEXT:    movd %xmm1, %esi
    437 ; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
    438 ; SSE3-NEXT:    andl $3, %eax
    439 ; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
    440 ; SSE3-NEXT:    andl $3, %ecx
    441 ; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    442 ; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    443 ; SSE3-NEXT:    andl $3, %edx
    444 ; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
    445 ; SSE3-NEXT:    andl $3, %esi
    446 ; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
    447 ; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    448 ; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    449 ; SSE3-NEXT:    retq
    450 ;
    451 ; SSSE3-LABEL: var_shuffle_v4f32:
    452 ; SSSE3:       # %bb.0:
    453 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
    454 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
    455 ; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
    456 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    457 ; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
    458 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
    459 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    460 ; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
    461 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    462 ; SSSE3-NEXT:    retq
    463 ;
    464 ; SSE41-LABEL: var_shuffle_v4f32:
    465 ; SSE41:       # %bb.0:
    466 ; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
    467 ; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
    468 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
    469 ; SSE41-NEXT:    retq
    470 ;
    471 ; AVX-LABEL: var_shuffle_v4f32:
    472 ; AVX:       # %bb.0:
    473 ; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
    474 ; AVX-NEXT:    retq
    475   %index0 = extractelement <4 x i32> %indices, i32 0
    476   %index1 = extractelement <4 x i32> %indices, i32 1
    477   %index2 = extractelement <4 x i32> %indices, i32 2
    478   %index3 = extractelement <4 x i32> %indices, i32 3
    479   %v0 = extractelement <4 x float> %v, i32 %index0
    480   %v1 = extractelement <4 x float> %v, i32 %index1
    481   %v2 = extractelement <4 x float> %v, i32 %index2
    482   %v3 = extractelement <4 x float> %v, i32 %index3
    483   %ret0 = insertelement <4 x float> undef, float %v0, i32 0
    484   %ret1 = insertelement <4 x float> %ret0, float %v1, i32 1
    485   %ret2 = insertelement <4 x float> %ret1, float %v2, i32 2
    486   %ret3 = insertelement <4 x float> %ret2, float %v3, i32 3
    487   ret <4 x float> %ret3
    488 }
    489 
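; Only the low 16 of the <32 x i8> indices are used, so this lowers like the
; plain v16i8 case (single PSHUFB on SSSE3+); the AVX path just adds vzeroupper.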
    490 define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
    491 ; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
    492 ; SSE3:       # %bb.0:
    493 ; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
    494 ; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
    495 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    496 ; SSE3-NEXT:    andl $15, %eax
    497 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    498 ; SSE3-NEXT:    movd %eax, %xmm8
    499 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    500 ; SSE3-NEXT:    andl $15, %eax
    501 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    502 ; SSE3-NEXT:    movd %eax, %xmm15
    503 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    504 ; SSE3-NEXT:    andl $15, %eax
    505 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    506 ; SSE3-NEXT:    movd %eax, %xmm9
    507 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    508 ; SSE3-NEXT:    andl $15, %eax
    509 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    510 ; SSE3-NEXT:    movd %eax, %xmm3
    511 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    512 ; SSE3-NEXT:    andl $15, %eax
    513 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    514 ; SSE3-NEXT:    movd %eax, %xmm10
    515 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    516 ; SSE3-NEXT:    andl $15, %eax
    517 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    518 ; SSE3-NEXT:    movd %eax, %xmm7
    519 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    520 ; SSE3-NEXT:    andl $15, %eax
    521 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    522 ; SSE3-NEXT:    movd %eax, %xmm11
    523 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    524 ; SSE3-NEXT:    andl $15, %eax
    525 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    526 ; SSE3-NEXT:    movd %eax, %xmm6
    527 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    528 ; SSE3-NEXT:    andl $15, %eax
    529 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    530 ; SSE3-NEXT:    movd %eax, %xmm12
    531 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    532 ; SSE3-NEXT:    andl $15, %eax
    533 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    534 ; SSE3-NEXT:    movd %eax, %xmm5
    535 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    536 ; SSE3-NEXT:    andl $15, %eax
    537 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    538 ; SSE3-NEXT:    movd %eax, %xmm13
    539 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    540 ; SSE3-NEXT:    andl $15, %eax
    541 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    542 ; SSE3-NEXT:    movd %eax, %xmm4
    543 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    544 ; SSE3-NEXT:    andl $15, %eax
    545 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    546 ; SSE3-NEXT:    movd %eax, %xmm14
    547 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    548 ; SSE3-NEXT:    andl $15, %eax
    549 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    550 ; SSE3-NEXT:    movd %eax, %xmm1
    551 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    552 ; SSE3-NEXT:    andl $15, %eax
    553 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    554 ; SSE3-NEXT:    movd %eax, %xmm2
    555 ; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
    556 ; SSE3-NEXT:    andl $15, %eax
    557 ; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
    558 ; SSE3-NEXT:    movd %eax, %xmm0
    559 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
    560 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
    561 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
    562 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
    563 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
    564 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
    565 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
    566 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
    567 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
    568 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
    569 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
    570 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    571 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    572 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    573 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
    574 ; SSE3-NEXT:    retq
    575 ;
    576 ; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
    577 ; SSSE3:       # %bb.0:
    578 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
    579 ; SSSE3-NEXT:    retq
    580 ;
    581 ; SSE41-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
    582 ; SSE41:       # %bb.0:
    583 ; SSE41-NEXT:    pshufb %xmm1, %xmm0
    584 ; SSE41-NEXT:    retq
    585 ;
    586 ; AVX-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
    587 ; AVX:       # %bb.0:
    588 ; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
    589 ; AVX-NEXT:    vzeroupper
    590 ; AVX-NEXT:    retq
    591   %index0 = extractelement <32 x i8> %indices, i32 0
    592   %index1 = extractelement <32 x i8> %indices, i32 1
    593   %index2 = extractelement <32 x i8> %indices, i32 2
    594   %index3 = extractelement <32 x i8> %indices, i32 3
    595   %index4 = extractelement <32 x i8> %indices, i32 4
    596   %index5 = extractelement <32 x i8> %indices, i32 5
    597   %index6 = extractelement <32 x i8> %indices, i32 6
    598   %index7 = extractelement <32 x i8> %indices, i32 7
    599   %index8 = extractelement <32 x i8> %indices, i32 8
    600   %index9 = extractelement <32 x i8> %indices, i32 9
    601   %index10 = extractelement <32 x i8> %indices, i32 10
    602   %index11 = extractelement <32 x i8> %indices, i32 11
    603   %index12 = extractelement <32 x i8> %indices, i32 12
    604   %index13 = extractelement <32 x i8> %indices, i32 13
    605   %index14 = extractelement <32 x i8> %indices, i32 14
    606   %index15 = extractelement <32 x i8> %indices, i32 15
    607   %v0 = extractelement <16 x i8> %v, i8 %index0
    608   %v1 = extractelement <16 x i8> %v, i8 %index1
    609   %v2 = extractelement <16 x i8> %v, i8 %index2
    610   %v3 = extractelement <16 x i8> %v, i8 %index3
    611   %v4 = extractelement <16 x i8> %v, i8 %index4
    612   %v5 = extractelement <16 x i8> %v, i8 %index5
    613   %v6 = extractelement <16 x i8> %v, i8 %index6
    614   %v7 = extractelement <16 x i8> %v, i8 %index7
    615   %v8 = extractelement <16 x i8> %v, i8 %index8
    616   %v9 = extractelement <16 x i8> %v, i8 %index9
    617   %v10 = extractelement <16 x i8> %v, i8 %index10
    618   %v11 = extractelement <16 x i8> %v, i8 %index11
    619   %v12 = extractelement <16 x i8> %v, i8 %index12
    620   %v13 = extractelement <16 x i8> %v, i8 %index13
    621   %v14 = extractelement <16 x i8> %v, i8 %index14
    622   %v15 = extractelement <16 x i8> %v, i8 %index15
    623   %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
    624   %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
    625   %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
    626   %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
    627   %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
    628   %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
    629   %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
    630   %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
    631   %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
    632   %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
    633   %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
    634   %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
    635   %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
    636   %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
    637   %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
    638   %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
    639   ret <16 x i8> %ret15
    640 }
    641 
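; The source is 32 bytes, so this is a cross-lane byte shuffle: SSE targets go
; through the stack (SSE4.1 rebuilds the result with PEXTRB/PINSRB), XOP uses
; VPPERM, AVX1/AVX2/AVX512-without-VL shuffle both halves with VPSHUFB and blend
; on a VPCMPGTB of the indices, AVX512VL+BW folds the blend into a masked
; VPSHUFB, and VL+VBMI lowers to a single VPERMB.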
    642 define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %indices) nounwind {
    643 ; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
    644 ; SSE3:       # %bb.0:
    645 ; SSE3-NEXT:    pushq %rbp
    646 ; SSE3-NEXT:    movq %rsp, %rbp
    647 ; SSE3-NEXT:    pushq %r15
    648 ; SSE3-NEXT:    pushq %r14
    649 ; SSE3-NEXT:    pushq %r13
    650 ; SSE3-NEXT:    pushq %r12
    651 ; SSE3-NEXT:    pushq %rbx
    652 ; SSE3-NEXT:    andq $-32, %rsp
    653 ; SSE3-NEXT:    subq $608, %rsp # imm = 0x260
    654 ; SSE3-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
    655 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    656 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    657 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
    658 ; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
    659 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    660 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    661 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
    662 ; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
    663 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    664 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    665 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
    666 ; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
    667 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    668 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    669 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
    670 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    671 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    672 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
    673 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    674 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    675 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
    676 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    677 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    678 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
    679 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    680 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    681 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
    682 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    683 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    684 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
    685 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    686 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    687 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
    688 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    689 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    690 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
    691 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    692 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    693 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
    694 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    695 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    696 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
    697 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    698 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    699 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
    700 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    701 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    702 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
    703 ; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    704 ; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    705 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r9d
    706 ; SSE3-NEXT:    andl $31, %r9d
    707 ; SSE3-NEXT:    movzbl 64(%rsp,%r9), %ebx
    708 ; SSE3-NEXT:    movd %ebx, %xmm8
    709 ; SSE3-NEXT:    andl $31, %eax
    710 ; SSE3-NEXT:    movzbl 96(%rsp,%rax), %eax
    711 ; SSE3-NEXT:    movd %eax, %xmm15
    712 ; SSE3-NEXT:    andl $31, %edx
    713 ; SSE3-NEXT:    movzbl 128(%rsp,%rdx), %eax
    714 ; SSE3-NEXT:    movd %eax, %xmm9
    715 ; SSE3-NEXT:    andl $31, %ecx
    716 ; SSE3-NEXT:    movzbl 160(%rsp,%rcx), %eax
    717 ; SSE3-NEXT:    movd %eax, %xmm3
    718 ; SSE3-NEXT:    andl $31, %esi
    719 ; SSE3-NEXT:    movzbl 192(%rsp,%rsi), %eax
    720 ; SSE3-NEXT:    movd %eax, %xmm10
    721 ; SSE3-NEXT:    andl $31, %edi
    722 ; SSE3-NEXT:    movzbl 224(%rsp,%rdi), %eax
    723 ; SSE3-NEXT:    movd %eax, %xmm7
    724 ; SSE3-NEXT:    andl $31, %r8d
    725 ; SSE3-NEXT:    movzbl 256(%rsp,%r8), %eax
    726 ; SSE3-NEXT:    movd %eax, %xmm11
    727 ; SSE3-NEXT:    andl $31, %r10d
    728 ; SSE3-NEXT:    movzbl 288(%rsp,%r10), %eax
    729 ; SSE3-NEXT:    movd %eax, %xmm6
    730 ; SSE3-NEXT:    andl $31, %r13d
    731 ; SSE3-NEXT:    movzbl 320(%rsp,%r13), %eax
    732 ; SSE3-NEXT:    movd %eax, %xmm12
    733 ; SSE3-NEXT:    andl $31, %r12d
    734 ; SSE3-NEXT:    movzbl 352(%rsp,%r12), %eax
    735 ; SSE3-NEXT:    movd %eax, %xmm5
    736 ; SSE3-NEXT:    andl $31, %r15d
    737 ; SSE3-NEXT:    movzbl 384(%rsp,%r15), %eax
    738 ; SSE3-NEXT:    movd %eax, %xmm13
    739 ; SSE3-NEXT:    andl $31, %r14d
    740 ; SSE3-NEXT:    movzbl 416(%rsp,%r14), %eax
    741 ; SSE3-NEXT:    movd %eax, %xmm4
    742 ; SSE3-NEXT:    andl $31, %r11d
    743 ; SSE3-NEXT:    movzbl 448(%rsp,%r11), %eax
    744 ; SSE3-NEXT:    movd %eax, %xmm14
    745 ; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
    746 ; SSE3-NEXT:    andl $31, %eax
    747 ; SSE3-NEXT:    movzbl 480(%rsp,%rax), %eax
    748 ; SSE3-NEXT:    movd %eax, %xmm1
    749 ; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
    750 ; SSE3-NEXT:    andl $31, %eax
    751 ; SSE3-NEXT:    movzbl 512(%rsp,%rax), %eax
    752 ; SSE3-NEXT:    movd %eax, %xmm2
    753 ; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
    754 ; SSE3-NEXT:    andl $31, %eax
    755 ; SSE3-NEXT:    movzbl 544(%rsp,%rax), %eax
    756 ; SSE3-NEXT:    movd %eax, %xmm0
    757 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
    758 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
    759 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
    760 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
    761 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
    762 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
    763 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
    764 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
    765 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
    766 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
    767 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
    768 ; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    769 ; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    770 ; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    771 ; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
    772 ; SSE3-NEXT:    leaq -40(%rbp), %rsp
    773 ; SSE3-NEXT:    popq %rbx
    774 ; SSE3-NEXT:    popq %r12
    775 ; SSE3-NEXT:    popq %r13
    776 ; SSE3-NEXT:    popq %r14
    777 ; SSE3-NEXT:    popq %r15
    778 ; SSE3-NEXT:    popq %rbp
    779 ; SSE3-NEXT:    retq
    780 ;
    781 ; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
    782 ; SSSE3:       # %bb.0:
    783 ; SSSE3-NEXT:    pushq %rbp
    784 ; SSSE3-NEXT:    movq %rsp, %rbp
    785 ; SSSE3-NEXT:    pushq %r15
    786 ; SSSE3-NEXT:    pushq %r14
    787 ; SSSE3-NEXT:    pushq %r13
    788 ; SSSE3-NEXT:    pushq %r12
    789 ; SSSE3-NEXT:    pushq %rbx
    790 ; SSSE3-NEXT:    andq $-32, %rsp
    791 ; SSSE3-NEXT:    subq $608, %rsp # imm = 0x260
    792 ; SSSE3-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
    793 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    794 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    795 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
    796 ; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
    797 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    798 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    799 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
    800 ; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
    801 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    802 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    803 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
    804 ; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
    805 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    806 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    807 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
    808 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    809 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    810 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
    811 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    812 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    813 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
    814 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    815 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    816 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
    817 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    818 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    819 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
    820 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    821 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    822 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
    823 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    824 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    825 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
    826 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    827 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    828 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
    829 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    830 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    831 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
    832 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    833 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    834 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
    835 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    836 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    837 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
    838 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    839 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    840 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
    841 ; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    842 ; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    843 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r9d
    844 ; SSSE3-NEXT:    andl $31, %r9d
    845 ; SSSE3-NEXT:    movzbl 64(%rsp,%r9), %ebx
    846 ; SSSE3-NEXT:    movd %ebx, %xmm8
    847 ; SSSE3-NEXT:    andl $31, %eax
    848 ; SSSE3-NEXT:    movzbl 96(%rsp,%rax), %eax
    849 ; SSSE3-NEXT:    movd %eax, %xmm15
    850 ; SSSE3-NEXT:    andl $31, %edx
    851 ; SSSE3-NEXT:    movzbl 128(%rsp,%rdx), %eax
    852 ; SSSE3-NEXT:    movd %eax, %xmm9
    853 ; SSSE3-NEXT:    andl $31, %ecx
    854 ; SSSE3-NEXT:    movzbl 160(%rsp,%rcx), %eax
    855 ; SSSE3-NEXT:    movd %eax, %xmm3
    856 ; SSSE3-NEXT:    andl $31, %esi
    857 ; SSSE3-NEXT:    movzbl 192(%rsp,%rsi), %eax
    858 ; SSSE3-NEXT:    movd %eax, %xmm10
    859 ; SSSE3-NEXT:    andl $31, %edi
    860 ; SSSE3-NEXT:    movzbl 224(%rsp,%rdi), %eax
    861 ; SSSE3-NEXT:    movd %eax, %xmm7
    862 ; SSSE3-NEXT:    andl $31, %r8d
    863 ; SSSE3-NEXT:    movzbl 256(%rsp,%r8), %eax
    864 ; SSSE3-NEXT:    movd %eax, %xmm11
    865 ; SSSE3-NEXT:    andl $31, %r10d
    866 ; SSSE3-NEXT:    movzbl 288(%rsp,%r10), %eax
    867 ; SSSE3-NEXT:    movd %eax, %xmm6
    868 ; SSSE3-NEXT:    andl $31, %r13d
    869 ; SSSE3-NEXT:    movzbl 320(%rsp,%r13), %eax
    870 ; SSSE3-NEXT:    movd %eax, %xmm12
    871 ; SSSE3-NEXT:    andl $31, %r12d
    872 ; SSSE3-NEXT:    movzbl 352(%rsp,%r12), %eax
    873 ; SSSE3-NEXT:    movd %eax, %xmm5
    874 ; SSSE3-NEXT:    andl $31, %r15d
    875 ; SSSE3-NEXT:    movzbl 384(%rsp,%r15), %eax
    876 ; SSSE3-NEXT:    movd %eax, %xmm13
    877 ; SSSE3-NEXT:    andl $31, %r14d
    878 ; SSSE3-NEXT:    movzbl 416(%rsp,%r14), %eax
    879 ; SSSE3-NEXT:    movd %eax, %xmm4
    880 ; SSSE3-NEXT:    andl $31, %r11d
    881 ; SSSE3-NEXT:    movzbl 448(%rsp,%r11), %eax
    882 ; SSSE3-NEXT:    movd %eax, %xmm14
    883 ; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
    884 ; SSSE3-NEXT:    andl $31, %eax
    885 ; SSSE3-NEXT:    movzbl 480(%rsp,%rax), %eax
    886 ; SSSE3-NEXT:    movd %eax, %xmm1
    887 ; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
    888 ; SSSE3-NEXT:    andl $31, %eax
    889 ; SSSE3-NEXT:    movzbl 512(%rsp,%rax), %eax
    890 ; SSSE3-NEXT:    movd %eax, %xmm2
    891 ; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
    892 ; SSSE3-NEXT:    andl $31, %eax
    893 ; SSSE3-NEXT:    movzbl 544(%rsp,%rax), %eax
    894 ; SSSE3-NEXT:    movd %eax, %xmm0
    895 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
    896 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
    897 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
    898 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
    899 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
    900 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
    901 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
    902 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
    903 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
    904 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
    905 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
    906 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    907 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    908 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
    909 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
    910 ; SSSE3-NEXT:    leaq -40(%rbp), %rsp
    911 ; SSSE3-NEXT:    popq %rbx
    912 ; SSSE3-NEXT:    popq %r12
    913 ; SSSE3-NEXT:    popq %r13
    914 ; SSSE3-NEXT:    popq %r14
    915 ; SSSE3-NEXT:    popq %r15
    916 ; SSSE3-NEXT:    popq %rbp
    917 ; SSSE3-NEXT:    retq
    918 ;
    919 ; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
    920 ; SSE41:       # %bb.0:
    921 ; SSE41-NEXT:    pushq %rbp
    922 ; SSE41-NEXT:    movq %rsp, %rbp
    923 ; SSE41-NEXT:    andq $-32, %rsp
    924 ; SSE41-NEXT:    subq $544, %rsp # imm = 0x220
    925 ; SSE41-NEXT:    pextrb $0, %xmm2, %eax
    926 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    927 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    928 ; SSE41-NEXT:    andl $31, %eax
    929 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    930 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    931 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    932 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    933 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    934 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    935 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    936 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    937 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    938 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    939 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    940 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    941 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    942 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    943 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    944 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    945 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    946 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    947 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    948 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    949 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    950 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    951 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    952 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    953 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    954 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    955 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    956 ; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
    957 ; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
    958 ; SSE41-NEXT:    movaps %xmm0, (%rsp)
    959 ; SSE41-NEXT:    movzbl 480(%rsp,%rax), %eax
    960 ; SSE41-NEXT:    movd %eax, %xmm0
    961 ; SSE41-NEXT:    pextrb $1, %xmm2, %eax
    962 ; SSE41-NEXT:    andl $31, %eax
    963 ; SSE41-NEXT:    pinsrb $1, 448(%rsp,%rax), %xmm0
    964 ; SSE41-NEXT:    pextrb $2, %xmm2, %eax
    965 ; SSE41-NEXT:    andl $31, %eax
    966 ; SSE41-NEXT:    pinsrb $2, 416(%rsp,%rax), %xmm0
    967 ; SSE41-NEXT:    pextrb $3, %xmm2, %eax
    968 ; SSE41-NEXT:    andl $31, %eax
    969 ; SSE41-NEXT:    pinsrb $3, 384(%rsp,%rax), %xmm0
    970 ; SSE41-NEXT:    pextrb $4, %xmm2, %eax
    971 ; SSE41-NEXT:    andl $31, %eax
    972 ; SSE41-NEXT:    pinsrb $4, 352(%rsp,%rax), %xmm0
    973 ; SSE41-NEXT:    pextrb $5, %xmm2, %eax
    974 ; SSE41-NEXT:    andl $31, %eax
    975 ; SSE41-NEXT:    pinsrb $5, 320(%rsp,%rax), %xmm0
    976 ; SSE41-NEXT:    pextrb $6, %xmm2, %eax
    977 ; SSE41-NEXT:    andl $31, %eax
    978 ; SSE41-NEXT:    pinsrb $6, 288(%rsp,%rax), %xmm0
    979 ; SSE41-NEXT:    pextrb $7, %xmm2, %eax
    980 ; SSE41-NEXT:    andl $31, %eax
    981 ; SSE41-NEXT:    pinsrb $7, 256(%rsp,%rax), %xmm0
    982 ; SSE41-NEXT:    pextrb $8, %xmm2, %eax
    983 ; SSE41-NEXT:    andl $31, %eax
    984 ; SSE41-NEXT:    pinsrb $8, 224(%rsp,%rax), %xmm0
    985 ; SSE41-NEXT:    pextrb $9, %xmm2, %eax
    986 ; SSE41-NEXT:    andl $31, %eax
    987 ; SSE41-NEXT:    pinsrb $9, 192(%rsp,%rax), %xmm0
    988 ; SSE41-NEXT:    pextrb $10, %xmm2, %eax
    989 ; SSE41-NEXT:    andl $31, %eax
    990 ; SSE41-NEXT:    pinsrb $10, 160(%rsp,%rax), %xmm0
    991 ; SSE41-NEXT:    pextrb $11, %xmm2, %eax
    992 ; SSE41-NEXT:    andl $31, %eax
    993 ; SSE41-NEXT:    pinsrb $11, 128(%rsp,%rax), %xmm0
    994 ; SSE41-NEXT:    pextrb $12, %xmm2, %eax
    995 ; SSE41-NEXT:    andl $31, %eax
    996 ; SSE41-NEXT:    pinsrb $12, 96(%rsp,%rax), %xmm0
    997 ; SSE41-NEXT:    pextrb $13, %xmm2, %eax
    998 ; SSE41-NEXT:    andl $31, %eax
    999 ; SSE41-NEXT:    pinsrb $13, 64(%rsp,%rax), %xmm0
   1000 ; SSE41-NEXT:    pextrb $14, %xmm2, %eax
   1001 ; SSE41-NEXT:    andl $31, %eax
   1002 ; SSE41-NEXT:    pinsrb $14, 32(%rsp,%rax), %xmm0
   1003 ; SSE41-NEXT:    pextrb $15, %xmm2, %eax
   1004 ; SSE41-NEXT:    andl $31, %eax
   1005 ; SSE41-NEXT:    pinsrb $15, (%rsp,%rax), %xmm0
   1006 ; SSE41-NEXT:    movq %rbp, %rsp
   1007 ; SSE41-NEXT:    popq %rbp
   1008 ; SSE41-NEXT:    retq
   1009 ;
   1010 ; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
   1011 ; XOP:       # %bb.0:
   1012 ; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1013 ; XOP-NEXT:    vpperm %xmm1, %xmm2, %xmm0, %xmm0
   1014 ; XOP-NEXT:    vzeroupper
   1015 ; XOP-NEXT:    retq
   1016 ;
   1017 ; AVX1-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
   1018 ; AVX1:       # %bb.0:
   1019 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1020 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
   1021 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
   1022 ; AVX1-NEXT:    vpcmpgtb {{.*}}(%rip), %xmm1, %xmm1
   1023 ; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
   1024 ; AVX1-NEXT:    vzeroupper
   1025 ; AVX1-NEXT:    retq
   1026 ;
   1027 ; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
   1028 ; AVX2:       # %bb.0:
   1029 ; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
   1030 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
   1031 ; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
   1032 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
   1033 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
   1034 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
   1035 ; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
   1036 ; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
   1037 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1038 ; AVX2-NEXT:    vzeroupper
   1039 ; AVX2-NEXT:    retq
   1040 ;
   1041 ; AVX512-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
   1042 ; AVX512:       # %bb.0:
   1043 ; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
   1044 ; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
   1045 ; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
   1046 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
   1047 ; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
   1048 ; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
   1049 ; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
   1050 ; AVX512-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
   1051 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1052 ; AVX512-NEXT:    vzeroupper
   1053 ; AVX512-NEXT:    retq
   1054 ;
   1055 ; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
   1056 ; AVX512VLBW:       # %bb.0:
   1057 ; AVX512VLBW-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
   1058 ; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
   1059 ; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
   1060 ; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm3
   1061 ; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
   1062 ; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
   1063 ; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2 {%k1}
   1064 ; AVX512VLBW-NEXT:    vmovdqa %xmm2, %xmm0
   1065 ; AVX512VLBW-NEXT:    vzeroupper
   1066 ; AVX512VLBW-NEXT:    retq
   1067 ;
   1068 ; VLVBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
   1069 ; VLVBMI:       # %bb.0:
   1070 ; VLVBMI-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
   1071 ; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
   1072 ; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1073 ; VLVBMI-NEXT:    vzeroupper
   1074 ; VLVBMI-NEXT:    retq
   1075   %index0 = extractelement <16 x i8> %indices, i32 0
   1076   %index1 = extractelement <16 x i8> %indices, i32 1
   1077   %index2 = extractelement <16 x i8> %indices, i32 2
   1078   %index3 = extractelement <16 x i8> %indices, i32 3
   1079   %index4 = extractelement <16 x i8> %indices, i32 4
   1080   %index5 = extractelement <16 x i8> %indices, i32 5
   1081   %index6 = extractelement <16 x i8> %indices, i32 6
   1082   %index7 = extractelement <16 x i8> %indices, i32 7
   1083   %index8 = extractelement <16 x i8> %indices, i32 8
   1084   %index9 = extractelement <16 x i8> %indices, i32 9
   1085   %index10 = extractelement <16 x i8> %indices, i32 10
   1086   %index11 = extractelement <16 x i8> %indices, i32 11
   1087   %index12 = extractelement <16 x i8> %indices, i32 12
   1088   %index13 = extractelement <16 x i8> %indices, i32 13
   1089   %index14 = extractelement <16 x i8> %indices, i32 14
   1090   %index15 = extractelement <16 x i8> %indices, i32 15
   1091   %v0 = extractelement <32 x i8> %v, i8 %index0
   1092   %v1 = extractelement <32 x i8> %v, i8 %index1
   1093   %v2 = extractelement <32 x i8> %v, i8 %index2
   1094   %v3 = extractelement <32 x i8> %v, i8 %index3
   1095   %v4 = extractelement <32 x i8> %v, i8 %index4
   1096   %v5 = extractelement <32 x i8> %v, i8 %index5
   1097   %v6 = extractelement <32 x i8> %v, i8 %index6
   1098   %v7 = extractelement <32 x i8> %v, i8 %index7
   1099   %v8 = extractelement <32 x i8> %v, i8 %index8
   1100   %v9 = extractelement <32 x i8> %v, i8 %index9
   1101   %v10 = extractelement <32 x i8> %v, i8 %index10
   1102   %v11 = extractelement <32 x i8> %v, i8 %index11
   1103   %v12 = extractelement <32 x i8> %v, i8 %index12
   1104   %v13 = extractelement <32 x i8> %v, i8 %index13
   1105   %v14 = extractelement <32 x i8> %v, i8 %index14
   1106   %v15 = extractelement <32 x i8> %v, i8 %index15
   1107   %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
   1108   %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
   1109   %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
   1110   %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
   1111   %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
   1112   %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
   1113   %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
   1114   %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
   1115   %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
   1116   %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
   1117   %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
   1118   %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
   1119   %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
   1120   %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
   1121   %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
   1122   %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
   1123   ret <16 x i8> %ret15
   1124 }
   1125