Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,NOBW,NOVBMI,AVX512F
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,NOVBMI,AVX512BW
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,VBMI
      5 
      6 define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
; Fully-variable <8 x i64> shuffle: result lane i is %v[%indices[i]].
; Every AVX512 run line folds the scalar extract/insert chain below into a
; single variable permute (vpermpd, index vector as the first AT&T-order
; source operand).
      7 ; AVX512-LABEL: var_shuffle_v8i64:
      8 ; AVX512:       # %bb.0:
      9 ; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
     10 ; AVX512-NEXT:    retq
; Extract all eight indices as scalars, then use each (as an i64) to pick a
; lane of %v, and rebuild the result lane by lane with insertelement.  The
; backend must recognize this whole pattern as one variable shuffle.
     11   %index0 = extractelement <8 x i64> %indices, i32 0
     12   %index1 = extractelement <8 x i64> %indices, i32 1
     13   %index2 = extractelement <8 x i64> %indices, i32 2
     14   %index3 = extractelement <8 x i64> %indices, i32 3
     15   %index4 = extractelement <8 x i64> %indices, i32 4
     16   %index5 = extractelement <8 x i64> %indices, i32 5
     17   %index6 = extractelement <8 x i64> %indices, i32 6
     18   %index7 = extractelement <8 x i64> %indices, i32 7
     19   %v0 = extractelement <8 x i64> %v, i64 %index0
     20   %v1 = extractelement <8 x i64> %v, i64 %index1
     21   %v2 = extractelement <8 x i64> %v, i64 %index2
     22   %v3 = extractelement <8 x i64> %v, i64 %index3
     23   %v4 = extractelement <8 x i64> %v, i64 %index4
     24   %v5 = extractelement <8 x i64> %v, i64 %index5
     25   %v6 = extractelement <8 x i64> %v, i64 %index6
     26   %v7 = extractelement <8 x i64> %v, i64 %index7
     27   %ret0 = insertelement <8 x i64> undef, i64 %v0, i32 0
     28   %ret1 = insertelement <8 x i64> %ret0, i64 %v1, i32 1
     29   %ret2 = insertelement <8 x i64> %ret1, i64 %v2, i32 2
     30   %ret3 = insertelement <8 x i64> %ret2, i64 %v3, i32 3
     31   %ret4 = insertelement <8 x i64> %ret3, i64 %v4, i32 4
     32   %ret5 = insertelement <8 x i64> %ret4, i64 %v5, i32 5
     33   %ret6 = insertelement <8 x i64> %ret5, i64 %v6, i32 6
     34   %ret7 = insertelement <8 x i64> %ret6, i64 %v7, i32 7
     35   ret <8 x i64> %ret7
     36 }
     37 
     38 define <16 x i32> @var_shuffle_v16i32(<16 x i32> %v, <16 x i32> %indices) nounwind {
; Fully-variable <16 x i32> shuffle: result lane i is %v[%indices[i]].
; All AVX512 subtargets lower the extract/insert chain below to a single
; vpermps (index vector first in AT&T operand order).
     39 ; AVX512-LABEL: var_shuffle_v16i32:
     40 ; AVX512:       # %bb.0:
     41 ; AVX512-NEXT:    vpermps %zmm0, %zmm1, %zmm0
     42 ; AVX512-NEXT:    retq
; Same lane-by-lane construction as the v8i64 case, here with i32 indices.
     43   %index0 = extractelement <16 x i32> %indices, i32 0
     44   %index1 = extractelement <16 x i32> %indices, i32 1
     45   %index2 = extractelement <16 x i32> %indices, i32 2
     46   %index3 = extractelement <16 x i32> %indices, i32 3
     47   %index4 = extractelement <16 x i32> %indices, i32 4
     48   %index5 = extractelement <16 x i32> %indices, i32 5
     49   %index6 = extractelement <16 x i32> %indices, i32 6
     50   %index7 = extractelement <16 x i32> %indices, i32 7
     51   %index8 = extractelement <16 x i32> %indices, i32 8
     52   %index9 = extractelement <16 x i32> %indices, i32 9
     53   %index10 = extractelement <16 x i32> %indices, i32 10
     54   %index11 = extractelement <16 x i32> %indices, i32 11
     55   %index12 = extractelement <16 x i32> %indices, i32 12
     56   %index13 = extractelement <16 x i32> %indices, i32 13
     57   %index14 = extractelement <16 x i32> %indices, i32 14
     58   %index15 = extractelement <16 x i32> %indices, i32 15
     59   %v0 = extractelement <16 x i32> %v, i32 %index0
     60   %v1 = extractelement <16 x i32> %v, i32 %index1
     61   %v2 = extractelement <16 x i32> %v, i32 %index2
     62   %v3 = extractelement <16 x i32> %v, i32 %index3
     63   %v4 = extractelement <16 x i32> %v, i32 %index4
     64   %v5 = extractelement <16 x i32> %v, i32 %index5
     65   %v6 = extractelement <16 x i32> %v, i32 %index6
     66   %v7 = extractelement <16 x i32> %v, i32 %index7
     67   %v8 = extractelement <16 x i32> %v, i32 %index8
     68   %v9 = extractelement <16 x i32> %v, i32 %index9
     69   %v10 = extractelement <16 x i32> %v, i32 %index10
     70   %v11 = extractelement <16 x i32> %v, i32 %index11
     71   %v12 = extractelement <16 x i32> %v, i32 %index12
     72   %v13 = extractelement <16 x i32> %v, i32 %index13
     73   %v14 = extractelement <16 x i32> %v, i32 %index14
     74   %v15 = extractelement <16 x i32> %v, i32 %index15
     75   %ret0 = insertelement <16 x i32> undef, i32 %v0, i32 0
     76   %ret1 = insertelement <16 x i32> %ret0, i32 %v1, i32 1
     77   %ret2 = insertelement <16 x i32> %ret1, i32 %v2, i32 2
     78   %ret3 = insertelement <16 x i32> %ret2, i32 %v3, i32 3
     79   %ret4 = insertelement <16 x i32> %ret3, i32 %v4, i32 4
     80   %ret5 = insertelement <16 x i32> %ret4, i32 %v5, i32 5
     81   %ret6 = insertelement <16 x i32> %ret5, i32 %v6, i32 6
     82   %ret7 = insertelement <16 x i32> %ret6, i32 %v7, i32 7
     83   %ret8 = insertelement <16 x i32> %ret7, i32 %v8, i32 8
     84   %ret9 = insertelement <16 x i32> %ret8, i32 %v9, i32 9
     85   %ret10 = insertelement <16 x i32> %ret9, i32 %v10, i32 10
     86   %ret11 = insertelement <16 x i32> %ret10, i32 %v11, i32 11
     87   %ret12 = insertelement <16 x i32> %ret11, i32 %v12, i32 12
     88   %ret13 = insertelement <16 x i32> %ret12, i32 %v13, i32 13
     89   %ret14 = insertelement <16 x i32> %ret13, i32 %v14, i32 14
     90   %ret15 = insertelement <16 x i32> %ret14, i32 %v15, i32 15
     91   ret <16 x i32> %ret15
     92 }
     93 
     94 define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwind {
; Fully-variable <32 x i16> shuffle.  Without AVX512BW there is no vpermw,
; so the NOBW lowering spills %v (ymm0/ymm1 halves) to 64-byte-aligned
; stack slots and gathers each half-word with movzwl/vpinsrw, masking every
; index with `andl $31`.  With AVX512BW the whole pattern is a single
; vpermw (index vector first in AT&T operand order).
     95 ; NOBW-LABEL: var_shuffle_v32i16:
     96 ; NOBW:       # %bb.0:
     97 ; NOBW-NEXT:    pushq %rbp
     98 ; NOBW-NEXT:    movq %rsp, %rbp
     99 ; NOBW-NEXT:    andq $-64, %rsp
    100 ; NOBW-NEXT:    subq $2112, %rsp # imm = 0x840
    101 ; NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm4
    102 ; NOBW-NEXT:    vmovd %xmm4, %eax
    103 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    104 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    105 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    106 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    107 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    108 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    109 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    110 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    111 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    112 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    113 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    114 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    115 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    116 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    117 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    118 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    119 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    120 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    121 ; NOBW-NEXT:    andl $31, %eax
    122 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    123 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    124 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    125 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    126 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    127 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    128 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    129 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    130 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    131 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    132 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    133 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    134 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    135 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    136 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    137 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    138 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    139 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    140 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    141 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    142 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    143 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    144 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    145 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    146 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    147 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    148 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    149 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    150 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    151 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    152 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    153 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    154 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    155 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    156 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    157 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    158 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    159 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    160 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    161 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    162 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    163 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    164 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    165 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    166 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    167 ; NOBW-NEXT:    vmovaps %ymm0, (%rsp)
    168 ; NOBW-NEXT:    movzwl 1472(%rsp,%rax,2), %eax
    169 ; NOBW-NEXT:    vmovd %eax, %xmm0
    170 ; NOBW-NEXT:    vpextrw $1, %xmm4, %eax
    171 ; NOBW-NEXT:    andl $31, %eax
    172 ; NOBW-NEXT:    vpinsrw $1, 1408(%rsp,%rax,2), %xmm0, %xmm0
    173 ; NOBW-NEXT:    vpextrw $2, %xmm4, %eax
    174 ; NOBW-NEXT:    andl $31, %eax
    175 ; NOBW-NEXT:    vpinsrw $2, 1344(%rsp,%rax,2), %xmm0, %xmm0
    176 ; NOBW-NEXT:    vpextrw $3, %xmm4, %eax
    177 ; NOBW-NEXT:    andl $31, %eax
    178 ; NOBW-NEXT:    vpinsrw $3, 1280(%rsp,%rax,2), %xmm0, %xmm0
    179 ; NOBW-NEXT:    vpextrw $4, %xmm4, %eax
    180 ; NOBW-NEXT:    andl $31, %eax
    181 ; NOBW-NEXT:    vpinsrw $4, 1216(%rsp,%rax,2), %xmm0, %xmm0
    182 ; NOBW-NEXT:    vpextrw $5, %xmm4, %eax
    183 ; NOBW-NEXT:    andl $31, %eax
    184 ; NOBW-NEXT:    vpinsrw $5, 1152(%rsp,%rax,2), %xmm0, %xmm0
    185 ; NOBW-NEXT:    vpextrw $6, %xmm4, %eax
    186 ; NOBW-NEXT:    andl $31, %eax
    187 ; NOBW-NEXT:    vpinsrw $6, 1088(%rsp,%rax,2), %xmm0, %xmm0
    188 ; NOBW-NEXT:    vpextrw $7, %xmm4, %eax
    189 ; NOBW-NEXT:    andl $31, %eax
    190 ; NOBW-NEXT:    vpinsrw $7, 1024(%rsp,%rax,2), %xmm0, %xmm0
    191 ; NOBW-NEXT:    vmovd %xmm2, %eax
    192 ; NOBW-NEXT:    andl $31, %eax
    193 ; NOBW-NEXT:    movzwl 1984(%rsp,%rax,2), %eax
    194 ; NOBW-NEXT:    vmovd %eax, %xmm1
    195 ; NOBW-NEXT:    vpextrw $1, %xmm2, %eax
    196 ; NOBW-NEXT:    andl $31, %eax
    197 ; NOBW-NEXT:    vpinsrw $1, 1920(%rsp,%rax,2), %xmm1, %xmm1
    198 ; NOBW-NEXT:    vpextrw $2, %xmm2, %eax
    199 ; NOBW-NEXT:    andl $31, %eax
    200 ; NOBW-NEXT:    vpinsrw $2, 1856(%rsp,%rax,2), %xmm1, %xmm1
    201 ; NOBW-NEXT:    vpextrw $3, %xmm2, %eax
    202 ; NOBW-NEXT:    andl $31, %eax
    203 ; NOBW-NEXT:    vpinsrw $3, 1792(%rsp,%rax,2), %xmm1, %xmm1
    204 ; NOBW-NEXT:    vpextrw $4, %xmm2, %eax
    205 ; NOBW-NEXT:    andl $31, %eax
    206 ; NOBW-NEXT:    vpinsrw $4, 1728(%rsp,%rax,2), %xmm1, %xmm1
    207 ; NOBW-NEXT:    vpextrw $5, %xmm2, %eax
    208 ; NOBW-NEXT:    andl $31, %eax
    209 ; NOBW-NEXT:    vpinsrw $5, 1664(%rsp,%rax,2), %xmm1, %xmm1
    210 ; NOBW-NEXT:    vpextrw $6, %xmm2, %eax
    211 ; NOBW-NEXT:    andl $31, %eax
    212 ; NOBW-NEXT:    vpinsrw $6, 1600(%rsp,%rax,2), %xmm1, %xmm1
    213 ; NOBW-NEXT:    vpextrw $7, %xmm2, %eax
    214 ; NOBW-NEXT:    vextracti128 $1, %ymm3, %xmm2
    215 ; NOBW-NEXT:    andl $31, %eax
    216 ; NOBW-NEXT:    vpinsrw $7, 1536(%rsp,%rax,2), %xmm1, %xmm1
    217 ; NOBW-NEXT:    vmovd %xmm2, %eax
    218 ; NOBW-NEXT:    andl $31, %eax
    219 ; NOBW-NEXT:    movzwl 448(%rsp,%rax,2), %eax
    220 ; NOBW-NEXT:    vmovd %eax, %xmm4
    221 ; NOBW-NEXT:    vpextrw $1, %xmm2, %eax
    222 ; NOBW-NEXT:    andl $31, %eax
    223 ; NOBW-NEXT:    vpinsrw $1, 384(%rsp,%rax,2), %xmm4, %xmm4
    224 ; NOBW-NEXT:    vpextrw $2, %xmm2, %eax
    225 ; NOBW-NEXT:    andl $31, %eax
    226 ; NOBW-NEXT:    vpinsrw $2, 320(%rsp,%rax,2), %xmm4, %xmm4
    227 ; NOBW-NEXT:    vpextrw $3, %xmm2, %eax
    228 ; NOBW-NEXT:    andl $31, %eax
    229 ; NOBW-NEXT:    vpinsrw $3, 256(%rsp,%rax,2), %xmm4, %xmm4
    230 ; NOBW-NEXT:    vpextrw $4, %xmm2, %eax
    231 ; NOBW-NEXT:    andl $31, %eax
    232 ; NOBW-NEXT:    vpinsrw $4, 192(%rsp,%rax,2), %xmm4, %xmm4
    233 ; NOBW-NEXT:    vpextrw $5, %xmm2, %eax
    234 ; NOBW-NEXT:    andl $31, %eax
    235 ; NOBW-NEXT:    vpinsrw $5, 128(%rsp,%rax,2), %xmm4, %xmm4
    236 ; NOBW-NEXT:    vpextrw $6, %xmm2, %eax
    237 ; NOBW-NEXT:    andl $31, %eax
    238 ; NOBW-NEXT:    vpinsrw $6, 64(%rsp,%rax,2), %xmm4, %xmm4
    239 ; NOBW-NEXT:    vpextrw $7, %xmm2, %eax
    240 ; NOBW-NEXT:    andl $31, %eax
    241 ; NOBW-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm2
    242 ; NOBW-NEXT:    vmovd %xmm3, %eax
    243 ; NOBW-NEXT:    andl $31, %eax
    244 ; NOBW-NEXT:    movzwl 960(%rsp,%rax,2), %eax
    245 ; NOBW-NEXT:    vmovd %eax, %xmm4
    246 ; NOBW-NEXT:    vpextrw $1, %xmm3, %eax
    247 ; NOBW-NEXT:    andl $31, %eax
    248 ; NOBW-NEXT:    vpinsrw $1, 896(%rsp,%rax,2), %xmm4, %xmm4
    249 ; NOBW-NEXT:    vpextrw $2, %xmm3, %eax
    250 ; NOBW-NEXT:    andl $31, %eax
    251 ; NOBW-NEXT:    vpinsrw $2, 832(%rsp,%rax,2), %xmm4, %xmm4
    252 ; NOBW-NEXT:    vpextrw $3, %xmm3, %eax
    253 ; NOBW-NEXT:    andl $31, %eax
    254 ; NOBW-NEXT:    vpinsrw $3, 768(%rsp,%rax,2), %xmm4, %xmm4
    255 ; NOBW-NEXT:    vpextrw $4, %xmm3, %eax
    256 ; NOBW-NEXT:    andl $31, %eax
    257 ; NOBW-NEXT:    vpinsrw $4, 704(%rsp,%rax,2), %xmm4, %xmm4
    258 ; NOBW-NEXT:    vpextrw $5, %xmm3, %eax
    259 ; NOBW-NEXT:    andl $31, %eax
    260 ; NOBW-NEXT:    vpinsrw $5, 640(%rsp,%rax,2), %xmm4, %xmm4
    261 ; NOBW-NEXT:    vpextrw $6, %xmm3, %eax
    262 ; NOBW-NEXT:    andl $31, %eax
    263 ; NOBW-NEXT:    vpinsrw $6, 576(%rsp,%rax,2), %xmm4, %xmm4
    264 ; NOBW-NEXT:    vpextrw $7, %xmm3, %eax
    265 ; NOBW-NEXT:    andl $31, %eax
    266 ; NOBW-NEXT:    vpinsrw $7, 512(%rsp,%rax,2), %xmm4, %xmm3
    267 ; NOBW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
    268 ; NOBW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm1
    269 ; NOBW-NEXT:    movq %rbp, %rsp
    270 ; NOBW-NEXT:    popq %rbp
    271 ; NOBW-NEXT:    retq
    272 ;
    273 ; AVX512BW-LABEL: var_shuffle_v32i16:
    274 ; AVX512BW:       # %bb.0:
    275 ; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
    276 ; AVX512BW-NEXT:    retq
; The IR below builds the shuffle lane by lane (extract all 32 indices,
; pick each lane of %v with an i16 index, reinsert); the backend must
; recognize the whole chain as one variable shuffle.
    277   %index0 = extractelement <32 x i16> %indices, i32 0
    278   %index1 = extractelement <32 x i16> %indices, i32 1
    279   %index2 = extractelement <32 x i16> %indices, i32 2
    280   %index3 = extractelement <32 x i16> %indices, i32 3
    281   %index4 = extractelement <32 x i16> %indices, i32 4
    282   %index5 = extractelement <32 x i16> %indices, i32 5
    283   %index6 = extractelement <32 x i16> %indices, i32 6
    284   %index7 = extractelement <32 x i16> %indices, i32 7
    285   %index8 = extractelement <32 x i16> %indices, i32 8
    286   %index9 = extractelement <32 x i16> %indices, i32 9
    287   %index10 = extractelement <32 x i16> %indices, i32 10
    288   %index11 = extractelement <32 x i16> %indices, i32 11
    289   %index12 = extractelement <32 x i16> %indices, i32 12
    290   %index13 = extractelement <32 x i16> %indices, i32 13
    291   %index14 = extractelement <32 x i16> %indices, i32 14
    292   %index15 = extractelement <32 x i16> %indices, i32 15
    293   %index16 = extractelement <32 x i16> %indices, i32 16
    294   %index17 = extractelement <32 x i16> %indices, i32 17
    295   %index18 = extractelement <32 x i16> %indices, i32 18
    296   %index19 = extractelement <32 x i16> %indices, i32 19
    297   %index20 = extractelement <32 x i16> %indices, i32 20
    298   %index21 = extractelement <32 x i16> %indices, i32 21
    299   %index22 = extractelement <32 x i16> %indices, i32 22
    300   %index23 = extractelement <32 x i16> %indices, i32 23
    301   %index24 = extractelement <32 x i16> %indices, i32 24
    302   %index25 = extractelement <32 x i16> %indices, i32 25
    303   %index26 = extractelement <32 x i16> %indices, i32 26
    304   %index27 = extractelement <32 x i16> %indices, i32 27
    305   %index28 = extractelement <32 x i16> %indices, i32 28
    306   %index29 = extractelement <32 x i16> %indices, i32 29
    307   %index30 = extractelement <32 x i16> %indices, i32 30
    308   %index31 = extractelement <32 x i16> %indices, i32 31
    309   %v0 = extractelement <32 x i16> %v, i16 %index0
    310   %v1 = extractelement <32 x i16> %v, i16 %index1
    311   %v2 = extractelement <32 x i16> %v, i16 %index2
    312   %v3 = extractelement <32 x i16> %v, i16 %index3
    313   %v4 = extractelement <32 x i16> %v, i16 %index4
    314   %v5 = extractelement <32 x i16> %v, i16 %index5
    315   %v6 = extractelement <32 x i16> %v, i16 %index6
    316   %v7 = extractelement <32 x i16> %v, i16 %index7
    317   %v8 = extractelement <32 x i16> %v, i16 %index8
    318   %v9 = extractelement <32 x i16> %v, i16 %index9
    319   %v10 = extractelement <32 x i16> %v, i16 %index10
    320   %v11 = extractelement <32 x i16> %v, i16 %index11
    321   %v12 = extractelement <32 x i16> %v, i16 %index12
    322   %v13 = extractelement <32 x i16> %v, i16 %index13
    323   %v14 = extractelement <32 x i16> %v, i16 %index14
    324   %v15 = extractelement <32 x i16> %v, i16 %index15
    325   %v16 = extractelement <32 x i16> %v, i16 %index16
    326   %v17 = extractelement <32 x i16> %v, i16 %index17
    327   %v18 = extractelement <32 x i16> %v, i16 %index18
    328   %v19 = extractelement <32 x i16> %v, i16 %index19
    329   %v20 = extractelement <32 x i16> %v, i16 %index20
    330   %v21 = extractelement <32 x i16> %v, i16 %index21
    331   %v22 = extractelement <32 x i16> %v, i16 %index22
    332   %v23 = extractelement <32 x i16> %v, i16 %index23
    333   %v24 = extractelement <32 x i16> %v, i16 %index24
    334   %v25 = extractelement <32 x i16> %v, i16 %index25
    335   %v26 = extractelement <32 x i16> %v, i16 %index26
    336   %v27 = extractelement <32 x i16> %v, i16 %index27
    337   %v28 = extractelement <32 x i16> %v, i16 %index28
    338   %v29 = extractelement <32 x i16> %v, i16 %index29
    339   %v30 = extractelement <32 x i16> %v, i16 %index30
    340   %v31 = extractelement <32 x i16> %v, i16 %index31
    341   %ret0 = insertelement <32 x i16> undef, i16 %v0, i32 0
    342   %ret1 = insertelement <32 x i16> %ret0, i16 %v1, i32 1
    343   %ret2 = insertelement <32 x i16> %ret1, i16 %v2, i32 2
    344   %ret3 = insertelement <32 x i16> %ret2, i16 %v3, i32 3
    345   %ret4 = insertelement <32 x i16> %ret3, i16 %v4, i32 4
    346   %ret5 = insertelement <32 x i16> %ret4, i16 %v5, i32 5
    347   %ret6 = insertelement <32 x i16> %ret5, i16 %v6, i32 6
    348   %ret7 = insertelement <32 x i16> %ret6, i16 %v7, i32 7
    349   %ret8 = insertelement <32 x i16> %ret7, i16 %v8, i32 8
    350   %ret9 = insertelement <32 x i16> %ret8, i16 %v9, i32 9
    351   %ret10 = insertelement <32 x i16> %ret9, i16 %v10, i32 10
    352   %ret11 = insertelement <32 x i16> %ret10, i16 %v11, i32 11
    353   %ret12 = insertelement <32 x i16> %ret11, i16 %v12, i32 12
    354   %ret13 = insertelement <32 x i16> %ret12, i16 %v13, i32 13
    355   %ret14 = insertelement <32 x i16> %ret13, i16 %v14, i32 14
    356   %ret15 = insertelement <32 x i16> %ret14, i16 %v15, i32 15
    357   %ret16 = insertelement <32 x i16> %ret15, i16 %v16, i32 16
    358   %ret17 = insertelement <32 x i16> %ret16, i16 %v17, i32 17
    359   %ret18 = insertelement <32 x i16> %ret17, i16 %v18, i32 18
    360   %ret19 = insertelement <32 x i16> %ret18, i16 %v19, i32 19
    361   %ret20 = insertelement <32 x i16> %ret19, i16 %v20, i32 20
    362   %ret21 = insertelement <32 x i16> %ret20, i16 %v21, i32 21
    363   %ret22 = insertelement <32 x i16> %ret21, i16 %v22, i32 22
    364   %ret23 = insertelement <32 x i16> %ret22, i16 %v23, i32 23
    365   %ret24 = insertelement <32 x i16> %ret23, i16 %v24, i32 24
    366   %ret25 = insertelement <32 x i16> %ret24, i16 %v25, i32 25
    367   %ret26 = insertelement <32 x i16> %ret25, i16 %v26, i32 26
    368   %ret27 = insertelement <32 x i16> %ret26, i16 %v27, i32 27
    369   %ret28 = insertelement <32 x i16> %ret27, i16 %v28, i32 28
    370   %ret29 = insertelement <32 x i16> %ret28, i16 %v29, i32 29
    371   %ret30 = insertelement <32 x i16> %ret29, i16 %v30, i32 30
    372   %ret31 = insertelement <32 x i16> %ret30, i16 %v31, i32 31
    373   ret <32 x i16> %ret31
    374 }
    375 
    376 define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
    377 ; NOBW-LABEL: var_shuffle_v64i8:
    378 ; NOBW:       # %bb.0:
    379 ; NOBW-NEXT:    pushq %rbp
    380 ; NOBW-NEXT:    movq %rsp, %rbp
    381 ; NOBW-NEXT:    andq $-64, %rsp
    382 ; NOBW-NEXT:    subq $4160, %rsp # imm = 0x1040
    383 ; NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm4
    384 ; NOBW-NEXT:    vpextrb $0, %xmm4, %eax
    385 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    386 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    387 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    388 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    389 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    390 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    391 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    392 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    393 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    394 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    395 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    396 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    397 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    398 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    399 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    400 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    401 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    402 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    403 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    404 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    405 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    406 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    407 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    408 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    409 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    410 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    411 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    412 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    413 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    414 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    415 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    416 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    417 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    418 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    419 ; NOBW-NEXT:    andl $63, %eax
    420 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    421 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    422 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    423 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    424 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    425 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    426 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    427 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    428 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    429 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    430 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    431 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    432 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    433 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    434 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    435 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    436 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    437 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    438 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    439 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    440 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    441 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    442 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    443 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    444 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    445 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    446 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    447 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    448 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    449 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    450 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    451 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    452 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    453 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    454 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    455 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    456 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    457 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    458 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    459 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    460 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    461 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    462 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    463 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    464 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    465 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    466 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    467 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    468 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    469 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    470 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    471 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    472 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    473 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    474 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    475 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    476 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    477 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    478 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    479 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    480 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    481 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    482 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    483 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    484 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    485 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    486 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    487 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    488 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    489 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    490 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    491 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    492 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    493 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    494 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    495 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    496 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    497 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    498 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    499 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    500 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    501 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    502 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    503 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    504 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    505 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    506 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    507 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    508 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    509 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    510 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    511 ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
    512 ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
    513 ; NOBW-NEXT:    vmovaps %ymm0, (%rsp)
    514 ; NOBW-NEXT:    movzbl 3008(%rsp,%rax), %eax
    515 ; NOBW-NEXT:    vmovd %eax, %xmm0
    516 ; NOBW-NEXT:    vpextrb $1, %xmm4, %eax
    517 ; NOBW-NEXT:    andl $63, %eax
    518 ; NOBW-NEXT:    vpinsrb $1, 2944(%rsp,%rax), %xmm0, %xmm0
    519 ; NOBW-NEXT:    vpextrb $2, %xmm4, %eax
    520 ; NOBW-NEXT:    andl $63, %eax
    521 ; NOBW-NEXT:    vpinsrb $2, 2880(%rsp,%rax), %xmm0, %xmm0
    522 ; NOBW-NEXT:    vpextrb $3, %xmm4, %eax
    523 ; NOBW-NEXT:    andl $63, %eax
    524 ; NOBW-NEXT:    vpinsrb $3, 2816(%rsp,%rax), %xmm0, %xmm0
    525 ; NOBW-NEXT:    vpextrb $4, %xmm4, %eax
    526 ; NOBW-NEXT:    andl $63, %eax
    527 ; NOBW-NEXT:    vpinsrb $4, 2752(%rsp,%rax), %xmm0, %xmm0
    528 ; NOBW-NEXT:    vpextrb $5, %xmm4, %eax
    529 ; NOBW-NEXT:    andl $63, %eax
    530 ; NOBW-NEXT:    vpinsrb $5, 2688(%rsp,%rax), %xmm0, %xmm0
    531 ; NOBW-NEXT:    vpextrb $6, %xmm4, %eax
    532 ; NOBW-NEXT:    andl $63, %eax
    533 ; NOBW-NEXT:    vpinsrb $6, 2624(%rsp,%rax), %xmm0, %xmm0
    534 ; NOBW-NEXT:    vpextrb $7, %xmm4, %eax
    535 ; NOBW-NEXT:    andl $63, %eax
    536 ; NOBW-NEXT:    vpinsrb $7, 2560(%rsp,%rax), %xmm0, %xmm0
    537 ; NOBW-NEXT:    vpextrb $8, %xmm4, %eax
    538 ; NOBW-NEXT:    andl $63, %eax
    539 ; NOBW-NEXT:    vpinsrb $8, 2496(%rsp,%rax), %xmm0, %xmm0
    540 ; NOBW-NEXT:    vpextrb $9, %xmm4, %eax
    541 ; NOBW-NEXT:    andl $63, %eax
    542 ; NOBW-NEXT:    vpinsrb $9, 2432(%rsp,%rax), %xmm0, %xmm0
    543 ; NOBW-NEXT:    vpextrb $10, %xmm4, %eax
    544 ; NOBW-NEXT:    andl $63, %eax
    545 ; NOBW-NEXT:    vpinsrb $10, 2368(%rsp,%rax), %xmm0, %xmm0
    546 ; NOBW-NEXT:    vpextrb $11, %xmm4, %eax
    547 ; NOBW-NEXT:    andl $63, %eax
    548 ; NOBW-NEXT:    vpinsrb $11, 2304(%rsp,%rax), %xmm0, %xmm0
    549 ; NOBW-NEXT:    vpextrb $12, %xmm4, %eax
    550 ; NOBW-NEXT:    andl $63, %eax
    551 ; NOBW-NEXT:    vpinsrb $12, 2240(%rsp,%rax), %xmm0, %xmm0
    552 ; NOBW-NEXT:    vpextrb $13, %xmm4, %eax
    553 ; NOBW-NEXT:    andl $63, %eax
    554 ; NOBW-NEXT:    vpinsrb $13, 2176(%rsp,%rax), %xmm0, %xmm0
    555 ; NOBW-NEXT:    vpextrb $14, %xmm4, %eax
    556 ; NOBW-NEXT:    andl $63, %eax
    557 ; NOBW-NEXT:    vpinsrb $14, 2112(%rsp,%rax), %xmm0, %xmm0
    558 ; NOBW-NEXT:    vpextrb $15, %xmm4, %eax
    559 ; NOBW-NEXT:    andl $63, %eax
    560 ; NOBW-NEXT:    vpinsrb $15, 2048(%rsp,%rax), %xmm0, %xmm0
    561 ; NOBW-NEXT:    vpextrb $0, %xmm2, %eax
    562 ; NOBW-NEXT:    andl $63, %eax
    563 ; NOBW-NEXT:    movzbl 4032(%rsp,%rax), %eax
    564 ; NOBW-NEXT:    vmovd %eax, %xmm1
    565 ; NOBW-NEXT:    vpextrb $1, %xmm2, %eax
    566 ; NOBW-NEXT:    andl $63, %eax
    567 ; NOBW-NEXT:    vpinsrb $1, 3968(%rsp,%rax), %xmm1, %xmm1
    568 ; NOBW-NEXT:    vpextrb $2, %xmm2, %eax
    569 ; NOBW-NEXT:    andl $63, %eax
    570 ; NOBW-NEXT:    vpinsrb $2, 3904(%rsp,%rax), %xmm1, %xmm1
    571 ; NOBW-NEXT:    vpextrb $3, %xmm2, %eax
    572 ; NOBW-NEXT:    andl $63, %eax
    573 ; NOBW-NEXT:    vpinsrb $3, 3840(%rsp,%rax), %xmm1, %xmm1
    574 ; NOBW-NEXT:    vpextrb $4, %xmm2, %eax
    575 ; NOBW-NEXT:    andl $63, %eax
    576 ; NOBW-NEXT:    vpinsrb $4, 3776(%rsp,%rax), %xmm1, %xmm1
    577 ; NOBW-NEXT:    vpextrb $5, %xmm2, %eax
    578 ; NOBW-NEXT:    andl $63, %eax
    579 ; NOBW-NEXT:    vpinsrb $5, 3712(%rsp,%rax), %xmm1, %xmm1
    580 ; NOBW-NEXT:    vpextrb $6, %xmm2, %eax
    581 ; NOBW-NEXT:    andl $63, %eax
    582 ; NOBW-NEXT:    vpinsrb $6, 3648(%rsp,%rax), %xmm1, %xmm1
    583 ; NOBW-NEXT:    vpextrb $7, %xmm2, %eax
    584 ; NOBW-NEXT:    andl $63, %eax
    585 ; NOBW-NEXT:    vpinsrb $7, 3584(%rsp,%rax), %xmm1, %xmm1
    586 ; NOBW-NEXT:    vpextrb $8, %xmm2, %eax
    587 ; NOBW-NEXT:    andl $63, %eax
    588 ; NOBW-NEXT:    vpinsrb $8, 3520(%rsp,%rax), %xmm1, %xmm1
    589 ; NOBW-NEXT:    vpextrb $9, %xmm2, %eax
    590 ; NOBW-NEXT:    andl $63, %eax
    591 ; NOBW-NEXT:    vpinsrb $9, 3456(%rsp,%rax), %xmm1, %xmm1
    592 ; NOBW-NEXT:    vpextrb $10, %xmm2, %eax
    593 ; NOBW-NEXT:    andl $63, %eax
    594 ; NOBW-NEXT:    vpinsrb $10, 3392(%rsp,%rax), %xmm1, %xmm1
    595 ; NOBW-NEXT:    vpextrb $11, %xmm2, %eax
    596 ; NOBW-NEXT:    andl $63, %eax
    597 ; NOBW-NEXT:    vpinsrb $11, 3328(%rsp,%rax), %xmm1, %xmm1
    598 ; NOBW-NEXT:    vpextrb $12, %xmm2, %eax
    599 ; NOBW-NEXT:    andl $63, %eax
    600 ; NOBW-NEXT:    vpinsrb $12, 3264(%rsp,%rax), %xmm1, %xmm1
    601 ; NOBW-NEXT:    vpextrb $13, %xmm2, %eax
    602 ; NOBW-NEXT:    andl $63, %eax
    603 ; NOBW-NEXT:    vpinsrb $13, 3200(%rsp,%rax), %xmm1, %xmm1
    604 ; NOBW-NEXT:    vpextrb $14, %xmm2, %eax
    605 ; NOBW-NEXT:    andl $63, %eax
    606 ; NOBW-NEXT:    vpinsrb $14, 3136(%rsp,%rax), %xmm1, %xmm1
    607 ; NOBW-NEXT:    vpextrb $15, %xmm2, %eax
    608 ; NOBW-NEXT:    vextracti128 $1, %ymm3, %xmm2
    609 ; NOBW-NEXT:    andl $63, %eax
    610 ; NOBW-NEXT:    vpinsrb $15, 3072(%rsp,%rax), %xmm1, %xmm1
    611 ; NOBW-NEXT:    vpextrb $0, %xmm2, %eax
    612 ; NOBW-NEXT:    andl $63, %eax
    613 ; NOBW-NEXT:    movzbl 960(%rsp,%rax), %eax
    614 ; NOBW-NEXT:    vmovd %eax, %xmm4
    615 ; NOBW-NEXT:    vpextrb $1, %xmm2, %eax
    616 ; NOBW-NEXT:    andl $63, %eax
    617 ; NOBW-NEXT:    vpinsrb $1, 896(%rsp,%rax), %xmm4, %xmm4
    618 ; NOBW-NEXT:    vpextrb $2, %xmm2, %eax
    619 ; NOBW-NEXT:    andl $63, %eax
    620 ; NOBW-NEXT:    vpinsrb $2, 832(%rsp,%rax), %xmm4, %xmm4
    621 ; NOBW-NEXT:    vpextrb $3, %xmm2, %eax
    622 ; NOBW-NEXT:    andl $63, %eax
    623 ; NOBW-NEXT:    vpinsrb $3, 768(%rsp,%rax), %xmm4, %xmm4
    624 ; NOBW-NEXT:    vpextrb $4, %xmm2, %eax
    625 ; NOBW-NEXT:    andl $63, %eax
    626 ; NOBW-NEXT:    vpinsrb $4, 704(%rsp,%rax), %xmm4, %xmm4
    627 ; NOBW-NEXT:    vpextrb $5, %xmm2, %eax
    628 ; NOBW-NEXT:    andl $63, %eax
    629 ; NOBW-NEXT:    vpinsrb $5, 640(%rsp,%rax), %xmm4, %xmm4
    630 ; NOBW-NEXT:    vpextrb $6, %xmm2, %eax
    631 ; NOBW-NEXT:    andl $63, %eax
    632 ; NOBW-NEXT:    vpinsrb $6, 576(%rsp,%rax), %xmm4, %xmm4
    633 ; NOBW-NEXT:    vpextrb $7, %xmm2, %eax
    634 ; NOBW-NEXT:    andl $63, %eax
    635 ; NOBW-NEXT:    vpinsrb $7, 512(%rsp,%rax), %xmm4, %xmm4
    636 ; NOBW-NEXT:    vpextrb $8, %xmm2, %eax
    637 ; NOBW-NEXT:    andl $63, %eax
    638 ; NOBW-NEXT:    vpinsrb $8, 448(%rsp,%rax), %xmm4, %xmm4
    639 ; NOBW-NEXT:    vpextrb $9, %xmm2, %eax
    640 ; NOBW-NEXT:    andl $63, %eax
    641 ; NOBW-NEXT:    vpinsrb $9, 384(%rsp,%rax), %xmm4, %xmm4
    642 ; NOBW-NEXT:    vpextrb $10, %xmm2, %eax
    643 ; NOBW-NEXT:    andl $63, %eax
    644 ; NOBW-NEXT:    vpinsrb $10, 320(%rsp,%rax), %xmm4, %xmm4
    645 ; NOBW-NEXT:    vpextrb $11, %xmm2, %eax
    646 ; NOBW-NEXT:    andl $63, %eax
    647 ; NOBW-NEXT:    vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4
    648 ; NOBW-NEXT:    vpextrb $12, %xmm2, %eax
    649 ; NOBW-NEXT:    andl $63, %eax
    650 ; NOBW-NEXT:    vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4
    651 ; NOBW-NEXT:    vpextrb $13, %xmm2, %eax
    652 ; NOBW-NEXT:    andl $63, %eax
    653 ; NOBW-NEXT:    vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4
    654 ; NOBW-NEXT:    vpextrb $14, %xmm2, %eax
    655 ; NOBW-NEXT:    andl $63, %eax
    656 ; NOBW-NEXT:    vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4
    657 ; NOBW-NEXT:    vpextrb $15, %xmm2, %eax
    658 ; NOBW-NEXT:    andl $63, %eax
    659 ; NOBW-NEXT:    vpinsrb $15, (%rsp,%rax), %xmm4, %xmm2
    660 ; NOBW-NEXT:    vpextrb $0, %xmm3, %eax
    661 ; NOBW-NEXT:    andl $63, %eax
    662 ; NOBW-NEXT:    movzbl 1984(%rsp,%rax), %eax
    663 ; NOBW-NEXT:    vmovd %eax, %xmm4
    664 ; NOBW-NEXT:    vpextrb $1, %xmm3, %eax
    665 ; NOBW-NEXT:    andl $63, %eax
    666 ; NOBW-NEXT:    vpinsrb $1, 1920(%rsp,%rax), %xmm4, %xmm4
    667 ; NOBW-NEXT:    vpextrb $2, %xmm3, %eax
    668 ; NOBW-NEXT:    andl $63, %eax
    669 ; NOBW-NEXT:    vpinsrb $2, 1856(%rsp,%rax), %xmm4, %xmm4
    670 ; NOBW-NEXT:    vpextrb $3, %xmm3, %eax
    671 ; NOBW-NEXT:    andl $63, %eax
    672 ; NOBW-NEXT:    vpinsrb $3, 1792(%rsp,%rax), %xmm4, %xmm4
    673 ; NOBW-NEXT:    vpextrb $4, %xmm3, %eax
    674 ; NOBW-NEXT:    andl $63, %eax
    675 ; NOBW-NEXT:    vpinsrb $4, 1728(%rsp,%rax), %xmm4, %xmm4
    676 ; NOBW-NEXT:    vpextrb $5, %xmm3, %eax
    677 ; NOBW-NEXT:    andl $63, %eax
    678 ; NOBW-NEXT:    vpinsrb $5, 1664(%rsp,%rax), %xmm4, %xmm4
    679 ; NOBW-NEXT:    vpextrb $6, %xmm3, %eax
    680 ; NOBW-NEXT:    andl $63, %eax
    681 ; NOBW-NEXT:    vpinsrb $6, 1600(%rsp,%rax), %xmm4, %xmm4
    682 ; NOBW-NEXT:    vpextrb $7, %xmm3, %eax
    683 ; NOBW-NEXT:    andl $63, %eax
    684 ; NOBW-NEXT:    vpinsrb $7, 1536(%rsp,%rax), %xmm4, %xmm4
    685 ; NOBW-NEXT:    vpextrb $8, %xmm3, %eax
    686 ; NOBW-NEXT:    andl $63, %eax
    687 ; NOBW-NEXT:    vpinsrb $8, 1472(%rsp,%rax), %xmm4, %xmm4
    688 ; NOBW-NEXT:    vpextrb $9, %xmm3, %eax
    689 ; NOBW-NEXT:    andl $63, %eax
    690 ; NOBW-NEXT:    vpinsrb $9, 1408(%rsp,%rax), %xmm4, %xmm4
    691 ; NOBW-NEXT:    vpextrb $10, %xmm3, %eax
    692 ; NOBW-NEXT:    andl $63, %eax
    693 ; NOBW-NEXT:    vpinsrb $10, 1344(%rsp,%rax), %xmm4, %xmm4
    694 ; NOBW-NEXT:    vpextrb $11, %xmm3, %eax
    695 ; NOBW-NEXT:    andl $63, %eax
    696 ; NOBW-NEXT:    vpinsrb $11, 1280(%rsp,%rax), %xmm4, %xmm4
    697 ; NOBW-NEXT:    vpextrb $12, %xmm3, %eax
    698 ; NOBW-NEXT:    andl $63, %eax
    699 ; NOBW-NEXT:    vpinsrb $12, 1216(%rsp,%rax), %xmm4, %xmm4
    700 ; NOBW-NEXT:    vpextrb $13, %xmm3, %eax
    701 ; NOBW-NEXT:    andl $63, %eax
    702 ; NOBW-NEXT:    vpinsrb $13, 1152(%rsp,%rax), %xmm4, %xmm4
    703 ; NOBW-NEXT:    vpextrb $14, %xmm3, %eax
    704 ; NOBW-NEXT:    andl $63, %eax
    705 ; NOBW-NEXT:    vpinsrb $14, 1088(%rsp,%rax), %xmm4, %xmm4
    706 ; NOBW-NEXT:    vpextrb $15, %xmm3, %eax
    707 ; NOBW-NEXT:    andl $63, %eax
    708 ; NOBW-NEXT:    vpinsrb $15, 1024(%rsp,%rax), %xmm4, %xmm3
    709 ; NOBW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
    710 ; NOBW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm1
    711 ; NOBW-NEXT:    movq %rbp, %rsp
    712 ; NOBW-NEXT:    popq %rbp
    713 ; NOBW-NEXT:    retq
    714 ;
    715 ; VBMI-LABEL: var_shuffle_v64i8:
    716 ; VBMI:       # %bb.0:
    717 ; VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
    718 ; VBMI-NEXT:    retq
    719   %index0 = extractelement <64 x i8> %indices, i32 0
    720   %index1 = extractelement <64 x i8> %indices, i32 1
    721   %index2 = extractelement <64 x i8> %indices, i32 2
    722   %index3 = extractelement <64 x i8> %indices, i32 3
    723   %index4 = extractelement <64 x i8> %indices, i32 4
    724   %index5 = extractelement <64 x i8> %indices, i32 5
    725   %index6 = extractelement <64 x i8> %indices, i32 6
    726   %index7 = extractelement <64 x i8> %indices, i32 7
    727   %index8 = extractelement <64 x i8> %indices, i32 8
    728   %index9 = extractelement <64 x i8> %indices, i32 9
    729   %index10 = extractelement <64 x i8> %indices, i32 10
    730   %index11 = extractelement <64 x i8> %indices, i32 11
    731   %index12 = extractelement <64 x i8> %indices, i32 12
    732   %index13 = extractelement <64 x i8> %indices, i32 13
    733   %index14 = extractelement <64 x i8> %indices, i32 14
    734   %index15 = extractelement <64 x i8> %indices, i32 15
    735   %index16 = extractelement <64 x i8> %indices, i32 16
    736   %index17 = extractelement <64 x i8> %indices, i32 17
    737   %index18 = extractelement <64 x i8> %indices, i32 18
    738   %index19 = extractelement <64 x i8> %indices, i32 19
    739   %index20 = extractelement <64 x i8> %indices, i32 20
    740   %index21 = extractelement <64 x i8> %indices, i32 21
    741   %index22 = extractelement <64 x i8> %indices, i32 22
    742   %index23 = extractelement <64 x i8> %indices, i32 23
    743   %index24 = extractelement <64 x i8> %indices, i32 24
    744   %index25 = extractelement <64 x i8> %indices, i32 25
    745   %index26 = extractelement <64 x i8> %indices, i32 26
    746   %index27 = extractelement <64 x i8> %indices, i32 27
    747   %index28 = extractelement <64 x i8> %indices, i32 28
    748   %index29 = extractelement <64 x i8> %indices, i32 29
    749   %index30 = extractelement <64 x i8> %indices, i32 30
    750   %index31 = extractelement <64 x i8> %indices, i32 31
    751   %index32 = extractelement <64 x i8> %indices, i32 32
    752   %index33 = extractelement <64 x i8> %indices, i32 33
    753   %index34 = extractelement <64 x i8> %indices, i32 34
    754   %index35 = extractelement <64 x i8> %indices, i32 35
    755   %index36 = extractelement <64 x i8> %indices, i32 36
    756   %index37 = extractelement <64 x i8> %indices, i32 37
    757   %index38 = extractelement <64 x i8> %indices, i32 38
    758   %index39 = extractelement <64 x i8> %indices, i32 39
    759   %index40 = extractelement <64 x i8> %indices, i32 40
    760   %index41 = extractelement <64 x i8> %indices, i32 41
    761   %index42 = extractelement <64 x i8> %indices, i32 42
    762   %index43 = extractelement <64 x i8> %indices, i32 43
    763   %index44 = extractelement <64 x i8> %indices, i32 44
    764   %index45 = extractelement <64 x i8> %indices, i32 45
    765   %index46 = extractelement <64 x i8> %indices, i32 46
    766   %index47 = extractelement <64 x i8> %indices, i32 47
    767   %index48 = extractelement <64 x i8> %indices, i32 48
    768   %index49 = extractelement <64 x i8> %indices, i32 49
    769   %index50 = extractelement <64 x i8> %indices, i32 50
    770   %index51 = extractelement <64 x i8> %indices, i32 51
    771   %index52 = extractelement <64 x i8> %indices, i32 52
    772   %index53 = extractelement <64 x i8> %indices, i32 53
    773   %index54 = extractelement <64 x i8> %indices, i32 54
    774   %index55 = extractelement <64 x i8> %indices, i32 55
    775   %index56 = extractelement <64 x i8> %indices, i32 56
    776   %index57 = extractelement <64 x i8> %indices, i32 57
    777   %index58 = extractelement <64 x i8> %indices, i32 58
    778   %index59 = extractelement <64 x i8> %indices, i32 59
    779   %index60 = extractelement <64 x i8> %indices, i32 60
    780   %index61 = extractelement <64 x i8> %indices, i32 61
    781   %index62 = extractelement <64 x i8> %indices, i32 62
    782   %index63 = extractelement <64 x i8> %indices, i32 63
    783   %v0 = extractelement <64 x i8> %v, i8 %index0
    784   %v1 = extractelement <64 x i8> %v, i8 %index1
    785   %v2 = extractelement <64 x i8> %v, i8 %index2
    786   %v3 = extractelement <64 x i8> %v, i8 %index3
    787   %v4 = extractelement <64 x i8> %v, i8 %index4
    788   %v5 = extractelement <64 x i8> %v, i8 %index5
    789   %v6 = extractelement <64 x i8> %v, i8 %index6
    790   %v7 = extractelement <64 x i8> %v, i8 %index7
    791   %v8 = extractelement <64 x i8> %v, i8 %index8
    792   %v9 = extractelement <64 x i8> %v, i8 %index9
    793   %v10 = extractelement <64 x i8> %v, i8 %index10
    794   %v11 = extractelement <64 x i8> %v, i8 %index11
    795   %v12 = extractelement <64 x i8> %v, i8 %index12
    796   %v13 = extractelement <64 x i8> %v, i8 %index13
    797   %v14 = extractelement <64 x i8> %v, i8 %index14
    798   %v15 = extractelement <64 x i8> %v, i8 %index15
    799   %v16 = extractelement <64 x i8> %v, i8 %index16
    800   %v17 = extractelement <64 x i8> %v, i8 %index17
    801   %v18 = extractelement <64 x i8> %v, i8 %index18
    802   %v19 = extractelement <64 x i8> %v, i8 %index19
    803   %v20 = extractelement <64 x i8> %v, i8 %index20
    804   %v21 = extractelement <64 x i8> %v, i8 %index21
    805   %v22 = extractelement <64 x i8> %v, i8 %index22
    806   %v23 = extractelement <64 x i8> %v, i8 %index23
    807   %v24 = extractelement <64 x i8> %v, i8 %index24
    808   %v25 = extractelement <64 x i8> %v, i8 %index25
    809   %v26 = extractelement <64 x i8> %v, i8 %index26
    810   %v27 = extractelement <64 x i8> %v, i8 %index27
    811   %v28 = extractelement <64 x i8> %v, i8 %index28
    812   %v29 = extractelement <64 x i8> %v, i8 %index29
    813   %v30 = extractelement <64 x i8> %v, i8 %index30
    814   %v31 = extractelement <64 x i8> %v, i8 %index31
    815   %v32 = extractelement <64 x i8> %v, i8 %index32
    816   %v33 = extractelement <64 x i8> %v, i8 %index33
    817   %v34 = extractelement <64 x i8> %v, i8 %index34
    818   %v35 = extractelement <64 x i8> %v, i8 %index35
    819   %v36 = extractelement <64 x i8> %v, i8 %index36
    820   %v37 = extractelement <64 x i8> %v, i8 %index37
    821   %v38 = extractelement <64 x i8> %v, i8 %index38
    822   %v39 = extractelement <64 x i8> %v, i8 %index39
    823   %v40 = extractelement <64 x i8> %v, i8 %index40
    824   %v41 = extractelement <64 x i8> %v, i8 %index41
    825   %v42 = extractelement <64 x i8> %v, i8 %index42
    826   %v43 = extractelement <64 x i8> %v, i8 %index43
    827   %v44 = extractelement <64 x i8> %v, i8 %index44
    828   %v45 = extractelement <64 x i8> %v, i8 %index45
    829   %v46 = extractelement <64 x i8> %v, i8 %index46
    830   %v47 = extractelement <64 x i8> %v, i8 %index47
    831   %v48 = extractelement <64 x i8> %v, i8 %index48
    832   %v49 = extractelement <64 x i8> %v, i8 %index49
    833   %v50 = extractelement <64 x i8> %v, i8 %index50
    834   %v51 = extractelement <64 x i8> %v, i8 %index51
    835   %v52 = extractelement <64 x i8> %v, i8 %index52
    836   %v53 = extractelement <64 x i8> %v, i8 %index53
    837   %v54 = extractelement <64 x i8> %v, i8 %index54
    838   %v55 = extractelement <64 x i8> %v, i8 %index55
    839   %v56 = extractelement <64 x i8> %v, i8 %index56
    840   %v57 = extractelement <64 x i8> %v, i8 %index57
    841   %v58 = extractelement <64 x i8> %v, i8 %index58
    842   %v59 = extractelement <64 x i8> %v, i8 %index59
    843   %v60 = extractelement <64 x i8> %v, i8 %index60
    844   %v61 = extractelement <64 x i8> %v, i8 %index61
    845   %v62 = extractelement <64 x i8> %v, i8 %index62
    846   %v63 = extractelement <64 x i8> %v, i8 %index63
    847   %ret0 = insertelement <64 x i8> undef, i8 %v0, i32 0
    848   %ret1 = insertelement <64 x i8> %ret0, i8 %v1, i32 1
    849   %ret2 = insertelement <64 x i8> %ret1, i8 %v2, i32 2
    850   %ret3 = insertelement <64 x i8> %ret2, i8 %v3, i32 3
    851   %ret4 = insertelement <64 x i8> %ret3, i8 %v4, i32 4
    852   %ret5 = insertelement <64 x i8> %ret4, i8 %v5, i32 5
    853   %ret6 = insertelement <64 x i8> %ret5, i8 %v6, i32 6
    854   %ret7 = insertelement <64 x i8> %ret6, i8 %v7, i32 7
    855   %ret8 = insertelement <64 x i8> %ret7, i8 %v8, i32 8
    856   %ret9 = insertelement <64 x i8> %ret8, i8 %v9, i32 9
    857   %ret10 = insertelement <64 x i8> %ret9, i8 %v10, i32 10
    858   %ret11 = insertelement <64 x i8> %ret10, i8 %v11, i32 11
    859   %ret12 = insertelement <64 x i8> %ret11, i8 %v12, i32 12
    860   %ret13 = insertelement <64 x i8> %ret12, i8 %v13, i32 13
    861   %ret14 = insertelement <64 x i8> %ret13, i8 %v14, i32 14
    862   %ret15 = insertelement <64 x i8> %ret14, i8 %v15, i32 15
    863   %ret16 = insertelement <64 x i8> %ret15, i8 %v16, i32 16
    864   %ret17 = insertelement <64 x i8> %ret16, i8 %v17, i32 17
    865   %ret18 = insertelement <64 x i8> %ret17, i8 %v18, i32 18
    866   %ret19 = insertelement <64 x i8> %ret18, i8 %v19, i32 19
    867   %ret20 = insertelement <64 x i8> %ret19, i8 %v20, i32 20
    868   %ret21 = insertelement <64 x i8> %ret20, i8 %v21, i32 21
    869   %ret22 = insertelement <64 x i8> %ret21, i8 %v22, i32 22
    870   %ret23 = insertelement <64 x i8> %ret22, i8 %v23, i32 23
    871   %ret24 = insertelement <64 x i8> %ret23, i8 %v24, i32 24
    872   %ret25 = insertelement <64 x i8> %ret24, i8 %v25, i32 25
    873   %ret26 = insertelement <64 x i8> %ret25, i8 %v26, i32 26
    874   %ret27 = insertelement <64 x i8> %ret26, i8 %v27, i32 27
    875   %ret28 = insertelement <64 x i8> %ret27, i8 %v28, i32 28
    876   %ret29 = insertelement <64 x i8> %ret28, i8 %v29, i32 29
    877   %ret30 = insertelement <64 x i8> %ret29, i8 %v30, i32 30
    878   %ret31 = insertelement <64 x i8> %ret30, i8 %v31, i32 31
    879   %ret32 = insertelement <64 x i8> %ret31, i8 %v32, i32 32
    880   %ret33 = insertelement <64 x i8> %ret32, i8 %v33, i32 33
    881   %ret34 = insertelement <64 x i8> %ret33, i8 %v34, i32 34
    882   %ret35 = insertelement <64 x i8> %ret34, i8 %v35, i32 35
    883   %ret36 = insertelement <64 x i8> %ret35, i8 %v36, i32 36
    884   %ret37 = insertelement <64 x i8> %ret36, i8 %v37, i32 37
    885   %ret38 = insertelement <64 x i8> %ret37, i8 %v38, i32 38
    886   %ret39 = insertelement <64 x i8> %ret38, i8 %v39, i32 39
    887   %ret40 = insertelement <64 x i8> %ret39, i8 %v40, i32 40
    888   %ret41 = insertelement <64 x i8> %ret40, i8 %v41, i32 41
    889   %ret42 = insertelement <64 x i8> %ret41, i8 %v42, i32 42
    890   %ret43 = insertelement <64 x i8> %ret42, i8 %v43, i32 43
    891   %ret44 = insertelement <64 x i8> %ret43, i8 %v44, i32 44
    892   %ret45 = insertelement <64 x i8> %ret44, i8 %v45, i32 45
    893   %ret46 = insertelement <64 x i8> %ret45, i8 %v46, i32 46
    894   %ret47 = insertelement <64 x i8> %ret46, i8 %v47, i32 47
    895   %ret48 = insertelement <64 x i8> %ret47, i8 %v48, i32 48
    896   %ret49 = insertelement <64 x i8> %ret48, i8 %v49, i32 49
    897   %ret50 = insertelement <64 x i8> %ret49, i8 %v50, i32 50
    898   %ret51 = insertelement <64 x i8> %ret50, i8 %v51, i32 51
    899   %ret52 = insertelement <64 x i8> %ret51, i8 %v52, i32 52
    900   %ret53 = insertelement <64 x i8> %ret52, i8 %v53, i32 53
    901   %ret54 = insertelement <64 x i8> %ret53, i8 %v54, i32 54
    902   %ret55 = insertelement <64 x i8> %ret54, i8 %v55, i32 55
    903   %ret56 = insertelement <64 x i8> %ret55, i8 %v56, i32 56
    904   %ret57 = insertelement <64 x i8> %ret56, i8 %v57, i32 57
    905   %ret58 = insertelement <64 x i8> %ret57, i8 %v58, i32 58
    906   %ret59 = insertelement <64 x i8> %ret58, i8 %v59, i32 59
    907   %ret60 = insertelement <64 x i8> %ret59, i8 %v60, i32 60
    908   %ret61 = insertelement <64 x i8> %ret60, i8 %v61, i32 61
    909   %ret62 = insertelement <64 x i8> %ret61, i8 %v62, i32 62
    910   %ret63 = insertelement <64 x i8> %ret62, i8 %v63, i32 63
    911   ret <64 x i8> %ret63
    912 }
    913 
; Variable (runtime-index) shuffle of <8 x double>: each result lane is taken
; from %v at the lane number given by the corresponding element of %indices.
; The fully scalarized extract/extract/insert chain below is expected to be
; recognized and collapsed into a single VPERMPD (variable f64-domain permute)
; on every AVX-512 run line; note the register form takes the index vector
; (%zmm1) as the permute control.
define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v8f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; Pull each 64-bit index out of the index vector as a scalar.
  %index0 = extractelement <8 x i64> %indices, i32 0
  %index1 = extractelement <8 x i64> %indices, i32 1
  %index2 = extractelement <8 x i64> %indices, i32 2
  %index3 = extractelement <8 x i64> %indices, i32 3
  %index4 = extractelement <8 x i64> %indices, i32 4
  %index5 = extractelement <8 x i64> %indices, i32 5
  %index6 = extractelement <8 x i64> %indices, i32 6
  %index7 = extractelement <8 x i64> %indices, i32 7
  ; Dynamically extract the selected source element for each output lane.
  %v0 = extractelement <8 x double> %v, i64 %index0
  %v1 = extractelement <8 x double> %v, i64 %index1
  %v2 = extractelement <8 x double> %v, i64 %index2
  %v3 = extractelement <8 x double> %v, i64 %index3
  %v4 = extractelement <8 x double> %v, i64 %index4
  %v5 = extractelement <8 x double> %v, i64 %index5
  %v6 = extractelement <8 x double> %v, i64 %index6
  %v7 = extractelement <8 x double> %v, i64 %index7
  ; Rebuild the result vector one lane at a time, starting from undef.
  %ret0 = insertelement <8 x double> undef, double %v0, i32 0
  %ret1 = insertelement <8 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <8 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <8 x double> %ret2, double %v3, i32 3
  %ret4 = insertelement <8 x double> %ret3, double %v4, i32 4
  %ret5 = insertelement <8 x double> %ret4, double %v5, i32 5
  %ret6 = insertelement <8 x double> %ret5, double %v6, i32 6
  %ret7 = insertelement <8 x double> %ret6, double %v7, i32 7
  ret <8 x double> %ret7
}
    945 
; Variable (runtime-index) shuffle of <16 x float>: each result lane is taken
; from %v at the lane number given by the corresponding element of %indices.
; The scalarized extract/extract/insert chain below is expected to collapse
; into a single VPERMPS (variable f32-domain permute) on every AVX-512 run
; line, with the index vector (%zmm1) used as the permute control.
define <16 x float> @var_shuffle_v16f32(<16 x float> %v, <16 x i32> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v16f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  ; Pull each 32-bit index out of the index vector as a scalar.
  %index0 = extractelement <16 x i32> %indices, i32 0
  %index1 = extractelement <16 x i32> %indices, i32 1
  %index2 = extractelement <16 x i32> %indices, i32 2
  %index3 = extractelement <16 x i32> %indices, i32 3
  %index4 = extractelement <16 x i32> %indices, i32 4
  %index5 = extractelement <16 x i32> %indices, i32 5
  %index6 = extractelement <16 x i32> %indices, i32 6
  %index7 = extractelement <16 x i32> %indices, i32 7
  %index8 = extractelement <16 x i32> %indices, i32 8
  %index9 = extractelement <16 x i32> %indices, i32 9
  %index10 = extractelement <16 x i32> %indices, i32 10
  %index11 = extractelement <16 x i32> %indices, i32 11
  %index12 = extractelement <16 x i32> %indices, i32 12
  %index13 = extractelement <16 x i32> %indices, i32 13
  %index14 = extractelement <16 x i32> %indices, i32 14
  %index15 = extractelement <16 x i32> %indices, i32 15
  ; Dynamically extract the selected source element for each output lane.
  %v0 = extractelement <16 x float> %v, i32 %index0
  %v1 = extractelement <16 x float> %v, i32 %index1
  %v2 = extractelement <16 x float> %v, i32 %index2
  %v3 = extractelement <16 x float> %v, i32 %index3
  %v4 = extractelement <16 x float> %v, i32 %index4
  %v5 = extractelement <16 x float> %v, i32 %index5
  %v6 = extractelement <16 x float> %v, i32 %index6
  %v7 = extractelement <16 x float> %v, i32 %index7
  %v8 = extractelement <16 x float> %v, i32 %index8
  %v9 = extractelement <16 x float> %v, i32 %index9
  %v10 = extractelement <16 x float> %v, i32 %index10
  %v11 = extractelement <16 x float> %v, i32 %index11
  %v12 = extractelement <16 x float> %v, i32 %index12
  %v13 = extractelement <16 x float> %v, i32 %index13
  %v14 = extractelement <16 x float> %v, i32 %index14
  %v15 = extractelement <16 x float> %v, i32 %index15
  ; Rebuild the result vector one lane at a time, starting from undef.
  %ret0 = insertelement <16 x float> undef, float %v0, i32 0
  %ret1 = insertelement <16 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <16 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <16 x float> %ret2, float %v3, i32 3
  %ret4 = insertelement <16 x float> %ret3, float %v4, i32 4
  %ret5 = insertelement <16 x float> %ret4, float %v5, i32 5
  %ret6 = insertelement <16 x float> %ret5, float %v6, i32 6
  %ret7 = insertelement <16 x float> %ret6, float %v7, i32 7
  %ret8 = insertelement <16 x float> %ret7, float %v8, i32 8
  %ret9 = insertelement <16 x float> %ret8, float %v9, i32 9
  %ret10 = insertelement <16 x float> %ret9, float %v10, i32 10
  %ret11 = insertelement <16 x float> %ret10, float %v11, i32 11
  %ret12 = insertelement <16 x float> %ret11, float %v12, i32 12
  %ret13 = insertelement <16 x float> %ret12, float %v13, i32 13
  %ret14 = insertelement <16 x float> %ret13, float %v14, i32 14
  %ret15 = insertelement <16 x float> %ret14, float %v15, i32 15
  ret <16 x float> %ret15
}
   1001