; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

;
; Unary shuffle indices from registers
;

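; Each function below builds its result from a source vector using lane indices
; that are only known at run time, via chains of extractelement/insertelement.
; With variable indices no shuffle mask can be formed, so the expected lowering
; (see the CHECK lines) spills the source vector to the stack and reloads the
; selected lanes one at a time. Roughly, in C-like pseudocode (illustrative
; sketch only, not part of the test; r, x and idx are placeholder names):
;
;   double r[4];
;   for (int k = 0; k != 4; ++k)
;     r[k] = x[idx[k]];
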
define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
; ALL:       # BB#0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double>   %r0, double %x1, i32 1
  %r2 = insertelement <4 x double>   %r1, double %x2, i32 2
  %r3 = insertelement <4 x double>   %r2, double %x3, i32 3
  ret <4 x double> %r3
}

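; In the next test lane 0 of the result is undef and lane 3 is zero, so only two
; reloads are needed: vmovddup fills the low half (the duplicated element covers
; the undef lane) and vmovsd, which zeros its upper element, fills the high half.
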
define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
; ALL:       # BB#0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double undef, i32 0
  %r1 = insertelement <4 x double>   %r0, double   %x1, i32 1
  %r2 = insertelement <4 x double>   %r1, double   %x2, i32 2
  %r3 = insertelement <4 x double>   %r2, double   0.0, i32 3
  ret <4 x double> %r3
}

define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i64 %i0
  %x1 = extractelement <2 x double> %x, i64 %i1
  %x2 = extractelement <2 x double> %x, i64 %i2
  %x3 = extractelement <2 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double>   %r0, double %x1, i32 1
  %r2 = insertelement <4 x double>   %r1, double %x2, i32 2
  %r3 = insertelement <4 x double>   %r2, double %x3, i32 3
  ret <4 x double> %r3
}

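; The i64 variants below follow the same spill-and-reload pattern but reassemble
; the result with vmovq/vpunpcklqdq; AVX1 and AVX2 differ only in the final
; 128-bit insert (vinsertf128 vs. vinserti128).
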
define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64   0, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64   0, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

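; For <8 x float> with i32 indices, AVX1 spills the vector and gathers lanes with
; vmovss/vinsertps, while AVX2 moves each index into a vector register and uses
; one vpermps per lane before packing the scalars back together with vinsertps.
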
define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movslq %esi, %rsi
; AVX1-NEXT:    movslq %edx, %rdx
; AVX1-NEXT:    movslq %ecx, %r11
; AVX1-NEXT:    movslq %r8d, %r10
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    movslq %r9d, %r8
; AVX1-NEXT:    movslq 16(%rbp), %rdi
; AVX1-NEXT:    movslq 24(%rbp), %rcx
; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm2
; AVX2-NEXT:    vmovd %edx, %xmm3
; AVX2-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-NEXT:    vmovd %ecx, %xmm4
; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm4
; AVX2-NEXT:    vmovd %r8d, %xmm5
; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm5
; AVX2-NEXT:    vmovd %r9d, %xmm6
; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm6
; AVX2-NEXT:    vmovd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpermps %ymm0, %ymm7, %ymm7
; AVX2-NEXT:    vmovd {{.*#+}} xmm8 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 %i0
  %x1 = extractelement <8 x float> %x, i32 %i1
  %x2 = extractelement <8 x float> %x, i32 %i2
  %x3 = extractelement <8 x float> %x, i32 %i3
  %x4 = extractelement <8 x float> %x, i32 %i4
  %x5 = extractelement <8 x float> %x, i32 %i5
  %x6 = extractelement <8 x float> %x, i32 %i6
  %x7 = extractelement <8 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float>   %r0, float %x1, i32 1
  %r2 = insertelement <8 x float>   %r1, float %x2, i32 2
  %r3 = insertelement <8 x float>   %r2, float %x3, i32 3
  %r4 = insertelement <8 x float>   %r3, float %x4, i32 4
  %r5 = insertelement <8 x float>   %r4, float %x5, i32 5
  %r6 = insertelement <8 x float>   %r5, float %x6, i32 6
  %r7 = insertelement <8 x float>   %r6, float %x7, i32 7
  ret <8 x float> %r7
}

define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
; ALL:       # BB#0:
; ALL-NEXT:    movslq %edi, %rax
; ALL-NEXT:    movslq %esi, %rsi
; ALL-NEXT:    movslq %edx, %rdx
; ALL-NEXT:    movslq %ecx, %r11
; ALL-NEXT:    movslq %r8d, %r10
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    movslq %r9d, %r8
; ALL-NEXT:    movslq {{[0-9]+}}(%rsp), %rdi
; ALL-NEXT:    movslq {{[0-9]+}}(%rsp), %rcx
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; ALL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 %i0
  %x1 = extractelement <4 x float> %x, i32 %i1
  %x2 = extractelement <4 x float> %x, i32 %i2
  %x3 = extractelement <4 x float> %x, i32 %i3
  %x4 = extractelement <4 x float> %x, i32 %i4
  %x5 = extractelement <4 x float> %x, i32 %i5
  %x6 = extractelement <4 x float> %x, i32 %i6
  %x7 = extractelement <4 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float>   %r0, float %x1, i32 1
  %r2 = insertelement <8 x float>   %r1, float %x2, i32 2
  %r3 = insertelement <8 x float>   %r2, float %x3, i32 3
  %r4 = insertelement <8 x float>   %r3, float %x4, i32 4
  %r5 = insertelement <8 x float>   %r4, float %x5, i32 5
  %r6 = insertelement <8 x float>   %r5, float %x6, i32 6
  %r7 = insertelement <8 x float>   %r6, float %x7, i32 7
  ret <8 x float> %r7
}

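; The 16 x i16 tests rebuild the result a word at a time with movzwl/vpinsrw from
; the spill slot; only the first six i32 indices arrive in registers, so the rest
; are sign-extended from their stack slots with movslq before use as offsets.
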
define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    movslq 32(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    movslq 40(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 48(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 56(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 64(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 72(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 80(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 88(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    movslq %esi, %rax
; AVX1-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %edx, %rax
; AVX1-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %ecx, %rax
; AVX1-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r8d, %rax
; AVX1-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r9d, %rax
; AVX1-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq 16(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX1-NEXT:    movslq 24(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    movslq 32(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    movslq 40(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 48(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 56(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 64(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 72(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 80(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 88(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq %edi, %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    movslq %esi, %rax
; AVX2-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %edx, %rax
; AVX2-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %ecx, %rax
; AVX2-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r8d, %rax
; AVX2-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r9d, %rax
; AVX2-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq 16(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX2-NEXT:    movslq 24(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0  = extractelement <16 x i16> %x, i32 %i0
  %x1  = extractelement <16 x i16> %x, i32 %i1
  %x2  = extractelement <16 x i16> %x, i32 %i2
  %x3  = extractelement <16 x i16> %x, i32 %i3
  %x4  = extractelement <16 x i16> %x, i32 %i4
  %x5  = extractelement <16 x i16> %x, i32 %i5
  %x6  = extractelement <16 x i16> %x, i32 %i6
  %x7  = extractelement <16 x i16> %x, i32 %i7
  %x8  = extractelement <16 x i16> %x, i32 %i8
  %x9  = extractelement <16 x i16> %x, i32 %i9
  %x10 = extractelement <16 x i16> %x, i32 %i10
  %x11 = extractelement <16 x i16> %x, i32 %i11
  %x12 = extractelement <16 x i16> %x, i32 %i12
  %x13 = extractelement <16 x i16> %x, i32 %i13
  %x14 = extractelement <16 x i16> %x, i32 %i14
  %x15 = extractelement <16 x i16> %x, i32 %i15
  %r0  = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1  = insertelement <16 x i16>  %r0 , i16 %x1 , i32 1
  %r2  = insertelement <16 x i16>  %r1 , i16 %x2 , i32 2
  %r3  = insertelement <16 x i16>  %r2 , i16 %x3 , i32 3
  %r4  = insertelement <16 x i16>  %r3 , i16 %x4 , i32 4
  %r5  = insertelement <16 x i16>  %r4 , i16 %x5 , i32 5
  %r6  = insertelement <16 x i16>  %r5 , i16 %x6 , i32 6
  %r7  = insertelement <16 x i16>  %r6 , i16 %x7 , i32 7
  %r8  = insertelement <16 x i16>  %r7 , i16 %x8 , i32 8
  %r9  = insertelement <16 x i16>  %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16>  %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16>  %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16>  %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16>  %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16>  %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16>  %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    movslq %esi, %rax
; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %edx, %rax
; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %ecx, %rax
; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r8d, %rax
; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r9d, %rax
; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq %edi, %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    movslq %esi, %rax
; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %edx, %rax
; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %ecx, %rax
; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r8d, %rax
; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r9d, %rax
; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %x0  = extractelement <8 x i16> %x, i32 %i0
  %x1  = extractelement <8 x i16> %x, i32 %i1
  %x2  = extractelement <8 x i16> %x, i32 %i2
  %x3  = extractelement <8 x i16> %x, i32 %i3
  %x4  = extractelement <8 x i16> %x, i32 %i4
  %x5  = extractelement <8 x i16> %x, i32 %i5
  %x6  = extractelement <8 x i16> %x, i32 %i6
  %x7  = extractelement <8 x i16> %x, i32 %i7
  %x8  = extractelement <8 x i16> %x, i32 %i8
  %x9  = extractelement <8 x i16> %x, i32 %i9
  %x10 = extractelement <8 x i16> %x, i32 %i10
  %x11 = extractelement <8 x i16> %x, i32 %i11
  %x12 = extractelement <8 x i16> %x, i32 %i12
  %x13 = extractelement <8 x i16> %x, i32 %i13
  %x14 = extractelement <8 x i16> %x, i32 %i14
  %x15 = extractelement <8 x i16> %x, i32 %i15
  %r0  = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1  = insertelement <16 x i16>  %r0 , i16 %x1 , i32 1
  %r2  = insertelement <16 x i16>  %r1 , i16 %x2 , i32 2
  %r3  = insertelement <16 x i16>  %r2 , i16 %x3 , i32 3
  %r4  = insertelement <16 x i16>  %r3 , i16 %x4 , i32 4
  %r5  = insertelement <16 x i16>  %r4 , i16 %x5 , i32 5
  %r6  = insertelement <16 x i16>  %r5 , i16 %x6 , i32 6
  %r7  = insertelement <16 x i16>  %r6 , i16 %x7 , i32 7
  %r8  = insertelement <16 x i16>  %r7 , i16 %x8 , i32 8
  %r9  = insertelement <16 x i16>  %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16>  %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16>  %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16>  %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16>  %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16>  %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16>  %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

;
; Unary shuffle indices from memory
;

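; These tests load the four i64 indices from %i first, so the expected code
; starts with four movq loads from (%rdi) before the familiar spill-and-reload
; sequence.
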
define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq 8(%rdi), %rcx
; AVX1-NEXT:    movq 16(%rdi), %rdx
; AVX1-NEXT:    movq 24(%rdi), %rsi
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq 8(%rdi), %rcx
; AVX2-NEXT:    movq 16(%rdi), %rdx
; AVX2-NEXT:    movq 24(%rdi), %rsi
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %p0  = getelementptr inbounds i64, i64* %i, i32 0
  %p1  = getelementptr inbounds i64, i64* %i, i32 1
  %p2  = getelementptr inbounds i64, i64* %i, i32 2
  %p3  = getelementptr inbounds i64, i64* %i, i32 3
  %i0  = load i64, i64* %p0, align 4
  %i1  = load i64, i64* %p1, align 4
  %i2  = load i64, i64* %p2, align 4
  %i3  = load i64, i64* %p3, align 4
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq 8(%rdi), %rcx
; AVX1-NEXT:    movq 16(%rdi), %rdx
; AVX1-NEXT:    movq 24(%rdi), %rsi
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq 8(%rdi), %rcx
; AVX2-NEXT:    movq 16(%rdi), %rdx
; AVX2-NEXT:    movq 24(%rdi), %rsi
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %p0  = getelementptr inbounds i64, i64* %i, i32 0
  %p1  = getelementptr inbounds i64, i64* %i, i32 1
  %p2  = getelementptr inbounds i64, i64* %i, i32 2
  %p3  = getelementptr inbounds i64, i64* %i, i32 3
  %i0  = load i64, i64* %p0, align 4
  %i1  = load i64, i64* %p1, align 4
  %i2  = load i64, i64* %p2, align 4
  %i3  = load i64, i64* %p3, align 4
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}