Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
     10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
     11 
      12 define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; Loads a <16 x i8> from %L, extracts the odd-index bytes (stride 2, offset 1)
; and stores the resulting <8 x i8> to %S. Best lowering is a single pshufb
; (SSE4.2+/AVX) or vpsrlw+vpmovwb truncate-store on AVX512BW+VL.
; NOTE(review): the CHECK lines below were autogenerated by
; utils/update_llc_test_checks.py -- regenerate rather than hand-edit.
      13 ; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
      14 ; SSE2:       # %bb.0:
      15 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
      16 ; SSE2-NEXT:    pxor %xmm1, %xmm1
      17 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
      18 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
      19 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
      20 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
      21 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
      22 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
      23 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
      24 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
      25 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
      26 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
      27 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
      28 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
      29 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
      30 ; SSE2-NEXT:    movq %xmm0, (%rsi)
      31 ; SSE2-NEXT:    retq
      32 ;
      33 ; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
      34 ; SSE42:       # %bb.0:
      35 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
      36 ; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
      37 ; SSE42-NEXT:    movq %xmm0, (%rsi)
      38 ; SSE42-NEXT:    retq
      39 ;
      40 ; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
      41 ; AVX:       # %bb.0:
      42 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
      43 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
      44 ; AVX-NEXT:    vmovq %xmm0, (%rsi)
      45 ; AVX-NEXT:    retq
      46 ;
      47 ; AVX512F-LABEL: shuffle_v16i8_to_v8i8_1:
      48 ; AVX512F:       # %bb.0:
      49 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
      50 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
      51 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
      52 ; AVX512F-NEXT:    retq
      53 ;
      54 ; AVX512VL-LABEL: shuffle_v16i8_to_v8i8_1:
      55 ; AVX512VL:       # %bb.0:
      56 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
      57 ; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
      58 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
      59 ; AVX512VL-NEXT:    retq
      60 ;
      61 ; AVX512BW-LABEL: shuffle_v16i8_to_v8i8_1:
      62 ; AVX512BW:       # %bb.0:
      63 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
      64 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
      65 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
      66 ; AVX512BW-NEXT:    retq
      67 ;
      68 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8_1:
      69 ; AVX512BWVL:       # %bb.0:
      70 ; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
      71 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
      72 ; AVX512BWVL-NEXT:    retq
; IR under test: strided (every-other-element) extraction via shufflevector.
      73   %vec = load <16 x i8>, <16 x i8>* %L
      74   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
      75   store <8 x i8> %strided.vec, <8 x i8>* %S
      76   ret void
      77 }
     78 
      79 define void @shuffle_v8i16_to_v4i16_1(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; Loads a <8 x i16>, extracts the odd-index elements (stride 2, offset 1)
; and stores the resulting <4 x i16>. AVX512VL targets use a vpsrld shift
; plus vpmovdw truncate-store instead of a shuffle.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
      80 ; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
      81 ; SSE2:       # %bb.0:
      82 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
      83 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
      84 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
      85 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
      86 ; SSE2-NEXT:    movq %xmm0, (%rsi)
      87 ; SSE2-NEXT:    retq
      88 ;
      89 ; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
      90 ; SSE42:       # %bb.0:
      91 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
      92 ; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
      93 ; SSE42-NEXT:    movq %xmm0, (%rsi)
      94 ; SSE42-NEXT:    retq
      95 ;
      96 ; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
      97 ; AVX:       # %bb.0:
      98 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
      99 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
     100 ; AVX-NEXT:    vmovq %xmm0, (%rsi)
     101 ; AVX-NEXT:    retq
     102 ;
     103 ; AVX512F-LABEL: shuffle_v8i16_to_v4i16_1:
     104 ; AVX512F:       # %bb.0:
     105 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
     106 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
     107 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
     108 ; AVX512F-NEXT:    retq
     109 ;
     110 ; AVX512VL-LABEL: shuffle_v8i16_to_v4i16_1:
     111 ; AVX512VL:       # %bb.0:
     112 ; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
     113 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
     114 ; AVX512VL-NEXT:    retq
     115 ;
     116 ; AVX512BW-LABEL: shuffle_v8i16_to_v4i16_1:
     117 ; AVX512BW:       # %bb.0:
     118 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     119 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
     120 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
     121 ; AVX512BW-NEXT:    retq
     122 ;
     123 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16_1:
     124 ; AVX512BWVL:       # %bb.0:
     125 ; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
     126 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
     127 ; AVX512BWVL-NEXT:    retq
; IR under test: odd-element extraction via shufflevector.
     128   %vec = load <8 x i16>, <8 x i16>* %L
     129   %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
     130   store <4 x i16> %strided.vec, <4 x i16>* %S
     131   ret void
     132 }
    133 
     134 define void @shuffle_v4i32_to_v2i32_1(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; Loads a <4 x i32>, extracts the odd-index elements (indices 1 and 3) and
; stores the resulting <2 x i32>. Non-VL targets fold the load into a
; shuffle + 64-bit store; VL targets use vpshufd + vpmovqd truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     135 ; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
     136 ; SSE:       # %bb.0:
     137 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
     138 ; SSE-NEXT:    movq %xmm0, (%rsi)
     139 ; SSE-NEXT:    retq
     140 ;
     141 ; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
     142 ; AVX:       # %bb.0:
     143 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
     144 ; AVX-NEXT:    vmovlps %xmm0, (%rsi)
     145 ; AVX-NEXT:    retq
     146 ;
     147 ; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1:
     148 ; AVX512F:       # %bb.0:
     149 ; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
     150 ; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
     151 ; AVX512F-NEXT:    retq
     152 ;
     153 ; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1:
     154 ; AVX512VL:       # %bb.0:
     155 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
     156 ; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
     157 ; AVX512VL-NEXT:    retq
     158 ;
     159 ; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1:
     160 ; AVX512BW:       # %bb.0:
     161 ; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
     162 ; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
     163 ; AVX512BW-NEXT:    retq
     164 ;
     165 ; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1:
     166 ; AVX512BWVL:       # %bb.0:
     167 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
     168 ; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
     169 ; AVX512BWVL-NEXT:    retq
; IR under test: odd-element extraction via shufflevector.
     170   %vec = load <4 x i32>, <4 x i32>* %L
     171   %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
     172   store <2 x i32> %strided.vec, <2 x i32>* %S
     173   ret void
     174 }
    175 
     176 define void @shuffle_v16i8_to_v4i8_1(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; Loads a <16 x i8>, extracts bytes 1,5,9,13 (stride 4, offset 1) and stores
; the resulting <4 x i8>. VL targets lower this as a byte shift right by 8
; followed by a vpmovdb truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     177 ; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
     178 ; SSE2:       # %bb.0:
     179 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
     180 ; SSE2-NEXT:    pxor %xmm1, %xmm1
     181 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
     182 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
     183 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
     184 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
     185 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
     186 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     187 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
     188 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
     189 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     190 ; SSE2-NEXT:    movd %xmm0, (%rsi)
     191 ; SSE2-NEXT:    retq
     192 ;
     193 ; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
     194 ; SSE42:       # %bb.0:
     195 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
     196 ; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
     197 ; SSE42-NEXT:    movd %xmm0, (%rsi)
     198 ; SSE42-NEXT:    retq
     199 ;
     200 ; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
     201 ; AVX:       # %bb.0:
     202 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
     203 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
     204 ; AVX-NEXT:    vmovd %xmm0, (%rsi)
     205 ; AVX-NEXT:    retq
     206 ;
     207 ; AVX512F-LABEL: shuffle_v16i8_to_v4i8_1:
     208 ; AVX512F:       # %bb.0:
     209 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
     210 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
     211 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
     212 ; AVX512F-NEXT:    retq
     213 ;
     214 ; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_1:
     215 ; AVX512VL:       # %bb.0:
     216 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
     217 ; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
     218 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
     219 ; AVX512VL-NEXT:    retq
     220 ;
     221 ; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_1:
     222 ; AVX512BW:       # %bb.0:
     223 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     224 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
     225 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
     226 ; AVX512BW-NEXT:    retq
     227 ;
     228 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_1:
     229 ; AVX512BWVL:       # %bb.0:
     230 ; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
     231 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
     232 ; AVX512BWVL-NEXT:    retq
; IR under test: stride-4 byte extraction (offset 1) via shufflevector.
     233   %vec = load <16 x i8>, <16 x i8>* %L
     234   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
     235   store <4 x i8> %strided.vec, <4 x i8>* %S
     236   ret void
     237 }
    238 
     239 define void @shuffle_v16i8_to_v4i8_2(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; Loads a <16 x i8>, extracts bytes 2,6,10,14 (stride 4, offset 2) and stores
; the resulting <4 x i8>. VL targets use vpsrld $16 + vpmovdb truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     240 ; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
     241 ; SSE2:       # %bb.0:
     242 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
     243 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
     244 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
     245 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
     246 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     247 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
     248 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     249 ; SSE2-NEXT:    movd %xmm0, (%rsi)
     250 ; SSE2-NEXT:    retq
     251 ;
     252 ; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
     253 ; SSE42:       # %bb.0:
     254 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
     255 ; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
     256 ; SSE42-NEXT:    movd %xmm0, (%rsi)
     257 ; SSE42-NEXT:    retq
     258 ;
     259 ; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
     260 ; AVX:       # %bb.0:
     261 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
     262 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
     263 ; AVX-NEXT:    vmovd %xmm0, (%rsi)
     264 ; AVX-NEXT:    retq
     265 ;
     266 ; AVX512F-LABEL: shuffle_v16i8_to_v4i8_2:
     267 ; AVX512F:       # %bb.0:
     268 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
     269 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
     270 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
     271 ; AVX512F-NEXT:    retq
     272 ;
     273 ; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_2:
     274 ; AVX512VL:       # %bb.0:
     275 ; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
     276 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
     277 ; AVX512VL-NEXT:    retq
     278 ;
     279 ; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_2:
     280 ; AVX512BW:       # %bb.0:
     281 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     282 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
     283 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
     284 ; AVX512BW-NEXT:    retq
     285 ;
     286 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_2:
     287 ; AVX512BWVL:       # %bb.0:
     288 ; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
     289 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
     290 ; AVX512BWVL-NEXT:    retq
; IR under test: stride-4 byte extraction (offset 2) via shufflevector.
     291   %vec = load <16 x i8>, <16 x i8>* %L
     292   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
     293   store <4 x i8> %strided.vec, <4 x i8>* %S
     294   ret void
     295 }
    296 
     297 define void @shuffle_v16i8_to_v4i8_3(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; Loads a <16 x i8>, extracts bytes 3,7,11,15 (stride 4, offset 3) and stores
; the resulting <4 x i8>. VL targets use vpsrld $24 + vpmovdb truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     298 ; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
     299 ; SSE2:       # %bb.0:
     300 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
     301 ; SSE2-NEXT:    pxor %xmm1, %xmm1
     302 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
     303 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
     304 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
     305 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
     306 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
     307 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
     308 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
     309 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
     310 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     311 ; SSE2-NEXT:    movd %xmm0, (%rsi)
     312 ; SSE2-NEXT:    retq
     313 ;
     314 ; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
     315 ; SSE42:       # %bb.0:
     316 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
     317 ; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
     318 ; SSE42-NEXT:    movd %xmm0, (%rsi)
     319 ; SSE42-NEXT:    retq
     320 ;
     321 ; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
     322 ; AVX:       # %bb.0:
     323 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
     324 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
     325 ; AVX-NEXT:    vmovd %xmm0, (%rsi)
     326 ; AVX-NEXT:    retq
     327 ;
     328 ; AVX512F-LABEL: shuffle_v16i8_to_v4i8_3:
     329 ; AVX512F:       # %bb.0:
     330 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
     331 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
     332 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
     333 ; AVX512F-NEXT:    retq
     334 ;
     335 ; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_3:
     336 ; AVX512VL:       # %bb.0:
     337 ; AVX512VL-NEXT:    vpsrld $24, (%rdi), %xmm0
     338 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
     339 ; AVX512VL-NEXT:    retq
     340 ;
     341 ; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_3:
     342 ; AVX512BW:       # %bb.0:
     343 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     344 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
     345 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
     346 ; AVX512BW-NEXT:    retq
     347 ;
     348 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_3:
     349 ; AVX512BWVL:       # %bb.0:
     350 ; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %xmm0
     351 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
     352 ; AVX512BWVL-NEXT:    retq
; IR under test: stride-4 byte extraction (offset 3) via shufflevector.
     353   %vec = load <16 x i8>, <16 x i8>* %L
     354   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
     355   store <4 x i8> %strided.vec, <4 x i8>* %S
     356   ret void
     357 }
    358 
     359 define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; Loads a <8 x i16>, extracts elements 1 and 5 (stride 4, offset 1) and stores
; the resulting <2 x i16>. The AVX2-SLOW/FAST split covers the
; fast-variable-shuffle tuning: FAST prefers a single vpshufb over
; vpshufd+vpshuflw. VL targets use vpsrld $16 + vpmovqw truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     360 ; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
     361 ; SSE:       # %bb.0:
     362 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
     363 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
     364 ; SSE-NEXT:    movd %xmm0, (%rsi)
     365 ; SSE-NEXT:    retq
     366 ;
     367 ; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
     368 ; AVX1:       # %bb.0:
     369 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
     370 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
     371 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
     372 ; AVX1-NEXT:    retq
     373 ;
     374 ; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
     375 ; AVX2-SLOW:       # %bb.0:
     376 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
     377 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
     378 ; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
     379 ; AVX2-SLOW-NEXT:    retq
     380 ;
     381 ; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
     382 ; AVX2-FAST:       # %bb.0:
     383 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
     384 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
     385 ; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
     386 ; AVX2-FAST-NEXT:    retq
     387 ;
     388 ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
     389 ; AVX512F:       # %bb.0:
     390 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
     391 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
     392 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
     393 ; AVX512F-NEXT:    retq
     394 ;
     395 ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
     396 ; AVX512VL:       # %bb.0:
     397 ; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
     398 ; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
     399 ; AVX512VL-NEXT:    retq
     400 ;
     401 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
     402 ; AVX512BW:       # %bb.0:
     403 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     404 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
     405 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
     406 ; AVX512BW-NEXT:    retq
     407 ;
     408 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
     409 ; AVX512BWVL:       # %bb.0:
     410 ; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
     411 ; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
     412 ; AVX512BWVL-NEXT:    retq
; IR under test: stride-4 i16 extraction (offset 1) via shufflevector.
     413   %vec = load <8 x i16>, <8 x i16>* %L
     414   %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
     415   store <2 x i16> %strided.vec, <2 x i16>* %S
     416   ret void
     417 }
    418 
     419 define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; Loads a <8 x i16>, extracts elements 2 and 6 (stride 4, offset 2) and stores
; the resulting <2 x i16>. VL targets use vpshufd + vpmovqw truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     420 ; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
     421 ; SSE:       # %bb.0:
     422 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
     423 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
     424 ; SSE-NEXT:    movd %xmm0, (%rsi)
     425 ; SSE-NEXT:    retq
     426 ;
     427 ; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
     428 ; AVX1:       # %bb.0:
     429 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
     430 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
     431 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
     432 ; AVX1-NEXT:    retq
     433 ;
     434 ; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
     435 ; AVX2-SLOW:       # %bb.0:
     436 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
     437 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
     438 ; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
     439 ; AVX2-SLOW-NEXT:    retq
     440 ;
     441 ; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
     442 ; AVX2-FAST:       # %bb.0:
     443 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
     444 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
     445 ; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
     446 ; AVX2-FAST-NEXT:    retq
     447 ;
     448 ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
     449 ; AVX512F:       # %bb.0:
     450 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
     451 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
     452 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
     453 ; AVX512F-NEXT:    retq
     454 ;
     455 ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
     456 ; AVX512VL:       # %bb.0:
     457 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
     458 ; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
     459 ; AVX512VL-NEXT:    retq
     460 ;
     461 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
     462 ; AVX512BW:       # %bb.0:
     463 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     464 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
     465 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
     466 ; AVX512BW-NEXT:    retq
     467 ;
     468 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
     469 ; AVX512BWVL:       # %bb.0:
     470 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
     471 ; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
     472 ; AVX512BWVL-NEXT:    retq
; IR under test: stride-4 i16 extraction (offset 2) via shufflevector.
     473   %vec = load <8 x i16>, <8 x i16>* %L
     474   %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
     475   store <2 x i16> %strided.vec, <2 x i16>* %S
     476   ret void
     477 }
    478 
     479 define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; Loads a <8 x i16>, extracts elements 3 and 7 (stride 4, offset 3) and stores
; the resulting <2 x i16>. VL targets use vpsrlq $48 + vpmovqw truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     480 ; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
     481 ; SSE:       # %bb.0:
     482 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
     483 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
     484 ; SSE-NEXT:    movd %xmm0, (%rsi)
     485 ; SSE-NEXT:    retq
     486 ;
     487 ; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
     488 ; AVX1:       # %bb.0:
     489 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
     490 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
     491 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
     492 ; AVX1-NEXT:    retq
     493 ;
     494 ; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
     495 ; AVX2-SLOW:       # %bb.0:
     496 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
     497 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
     498 ; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
     499 ; AVX2-SLOW-NEXT:    retq
     500 ;
     501 ; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
     502 ; AVX2-FAST:       # %bb.0:
     503 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
     504 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
     505 ; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
     506 ; AVX2-FAST-NEXT:    retq
     507 ;
     508 ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
     509 ; AVX512F:       # %bb.0:
     510 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
     511 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
     512 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
     513 ; AVX512F-NEXT:    retq
     514 ;
     515 ; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
     516 ; AVX512VL:       # %bb.0:
     517 ; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %xmm0
     518 ; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
     519 ; AVX512VL-NEXT:    retq
     520 ;
     521 ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
     522 ; AVX512BW:       # %bb.0:
     523 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     524 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
     525 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
     526 ; AVX512BW-NEXT:    retq
     527 ;
     528 ; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
     529 ; AVX512BWVL:       # %bb.0:
     530 ; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %xmm0
     531 ; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
     532 ; AVX512BWVL-NEXT:    retq
; IR under test: stride-4 i16 extraction (offset 3) via shufflevector.
     533   %vec = load <8 x i16>, <8 x i16>* %L
     534   %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
     535   store <2 x i16> %strided.vec, <2 x i16>* %S
     536   ret void
     537 }
    538 
     539 define void @shuffle_v16i8_to_v2i8_1(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; Loads a <16 x i8>, extracts bytes 1 and 9 (stride 8, offset 1) and stores
; the resulting <2 x i8> (a 16-bit store, hence pextrw/movw). VL targets use
; vpsrlw $8 + vpmovqb truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     540 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
     541 ; SSE2:       # %bb.0:
     542 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
     543 ; SSE2-NEXT:    pxor %xmm1, %xmm1
     544 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
     545 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
     546 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
     547 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
     548 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
     549 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     550 ; SSE2-NEXT:    movd %xmm0, %eax
     551 ; SSE2-NEXT:    movw %ax, (%rsi)
     552 ; SSE2-NEXT:    retq
     553 ;
     554 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
     555 ; SSE42:       # %bb.0:
     556 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
     557 ; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     558 ; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
     559 ; SSE42-NEXT:    retq
     560 ;
     561 ; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
     562 ; AVX:       # %bb.0:
     563 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
     564 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     565 ; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
     566 ; AVX-NEXT:    retq
     567 ;
     568 ; AVX512F-LABEL: shuffle_v16i8_to_v2i8_1:
     569 ; AVX512F:       # %bb.0:
     570 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
     571 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     572 ; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
     573 ; AVX512F-NEXT:    retq
     574 ;
     575 ; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_1:
     576 ; AVX512VL:       # %bb.0:
     577 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
     578 ; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
     579 ; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
     580 ; AVX512VL-NEXT:    retq
     581 ;
     582 ; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_1:
     583 ; AVX512BW:       # %bb.0:
     584 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     585 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     586 ; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
     587 ; AVX512BW-NEXT:    retq
     588 ;
     589 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_1:
     590 ; AVX512BWVL:       # %bb.0:
     591 ; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %xmm0
     592 ; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
     593 ; AVX512BWVL-NEXT:    retq
; IR under test: stride-8 byte extraction (offset 1) via shufflevector.
     594   %vec = load <16 x i8>, <16 x i8>* %L
     595   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
     596   store <2 x i8> %strided.vec, <2 x i8>* %S
     597   ret void
     598 }
    599 
     600 define void @shuffle_v16i8_to_v2i8_2(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; Loads a <16 x i8>, extracts bytes 2 and 10 (stride 8, offset 2) and stores
; the resulting <2 x i8>. VL targets use vpsrld $16 + vpmovqb truncate-store.
; NOTE(review): CHECK lines autogenerated by utils/update_llc_test_checks.py;
; regenerate rather than hand-edit.
     601 ; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
     602 ; SSE2:       # %bb.0:
     603 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
     604 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
     605 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     606 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
     607 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
     608 ; SSE2-NEXT:    movd %xmm0, %eax
     609 ; SSE2-NEXT:    movw %ax, (%rsi)
     610 ; SSE2-NEXT:    retq
     611 ;
     612 ; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
     613 ; SSE42:       # %bb.0:
     614 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
     615 ; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     616 ; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
     617 ; SSE42-NEXT:    retq
     618 ;
     619 ; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
     620 ; AVX:       # %bb.0:
     621 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
     622 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     623 ; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
     624 ; AVX-NEXT:    retq
     625 ;
     626 ; AVX512F-LABEL: shuffle_v16i8_to_v2i8_2:
     627 ; AVX512F:       # %bb.0:
     628 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
     629 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     630 ; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
     631 ; AVX512F-NEXT:    retq
     632 ;
     633 ; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_2:
     634 ; AVX512VL:       # %bb.0:
     635 ; AVX512VL-NEXT:    vpsrld $16, (%rdi), %xmm0
     636 ; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
     637 ; AVX512VL-NEXT:    retq
     638 ;
     639 ; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_2:
     640 ; AVX512BW:       # %bb.0:
     641 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
     642 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
     643 ; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
     644 ; AVX512BW-NEXT:    retq
     645 ;
     646 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_2:
     647 ; AVX512BWVL:       # %bb.0:
     648 ; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %xmm0
     649 ; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
     650 ; AVX512BWVL-NEXT:    retq
; IR under test: stride-8 byte extraction (offset 2) via shufflevector.
     651   %vec = load <16 x i8>, <16 x i8>* %L
     652   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
     653   store <2 x i8> %strided.vec, <2 x i8>* %S
     654   ret void
     655 }
    656 
; Test: extract bytes {3,11} (a stride-8 pair starting at offset 3) from a
; <16 x i8> load and store them as <2 x i8>. SSE4.2/AVX/AVX512F/AVX512BW
; lower this to pshufb + pextrw; AVX512VL/BWVL instead shift each 32-bit
; element right by 24 and use a vpmovqb truncating store; SSE2 must emulate
; with zero-unpack / pack sequences.
define void @shuffle_v16i8_to_v2i8_3(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
; IR under test: shuffle mask <3, 11> picks byte 3 of each 64-bit half.
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}
    716 
; Test: extract bytes {4,12} from a <16 x i8> load and store them as
; <2 x i8>. SSE4.2/AVX/AVX512F/AVX512BW use pshufb + pextrw. Because the
; offset is dword-aligned, AVX512VL/BWVL need no shift: a vpshufd that
; duplicates the odd dwords (mem[1,1,3,3]) followed by a vpmovqb truncating
; store suffices. SSE2 masks with pand and repacks.
define void @shuffle_v16i8_to_v2i8_4(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
; IR under test: shuffle mask <4, 12> picks byte 4 of each 64-bit half.
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}
    773 
; Test: extract bytes {5,13} from a <16 x i8> load and store them as
; <2 x i8>. SSE4.2/AVX/AVX512F/AVX512BW use pshufb + pextrw; AVX512VL/BWVL
; shift each 64-bit element right by 40 (= 5*8 bits) and use a vpmovqb
; truncating store; SSE2 emulates with zero-unpack / pack sequences.
define void @shuffle_v16i8_to_v2i8_5(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $40, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $40, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
; IR under test: shuffle mask <5, 13> picks byte 5 of each 64-bit half.
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}
    833 
; Test: extract bytes {6,14} from a <16 x i8> load and store them as
; <2 x i8>. SSE4.2/AVX/AVX512F/AVX512BW use pshufb + pextrw; AVX512VL/BWVL
; shift each 64-bit element right by 48 (= 6*8 bits) and use a vpmovqb
; truncating store; SSE2 masks with pand and repacks.
define void @shuffle_v16i8_to_v2i8_6(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
; IR under test: shuffle mask <6, 14> picks byte 6 of each 64-bit half.
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}
    890 
; Test: extract bytes {7,15} (the top byte of each 64-bit half) from a
; <16 x i8> load and store them as <2 x i8>. SSE4.2/AVX/AVX512F/AVX512BW use
; pshufb + pextrw; AVX512VL/BWVL shift each 64-bit element right by 56
; (= 7*8 bits) and use a vpmovqb truncating store; SSE2 emulates with
; zero-unpack / pack sequences.
define void @shuffle_v16i8_to_v2i8_7(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $56, (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $56, (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
; IR under test: shuffle mask <7, 15> picks byte 7 of each 64-bit half.
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}
    950 
    951