      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
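; Strided-extraction tests: each function below loads a 256-bit vector, uses a
; shufflevector to pull out every 2nd, 4th or 8th element starting at a nonzero
; offset, and stores the narrowed result. The CHECK lines are autogenerated.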
      9 
     10 define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
     11 ; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
     12 ; AVX1:       # %bb.0:
     13 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
     14 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     15 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
     16 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     17 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     18 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     19 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
     20 ; AVX1-NEXT:    vzeroupper
     21 ; AVX1-NEXT:    retq
     22 ;
     23 ; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
     24 ; AVX2:       # %bb.0:
     25 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
     26 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     27 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
     28 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     29 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     30 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     31 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
     32 ; AVX2-NEXT:    vzeroupper
     33 ; AVX2-NEXT:    retq
     34 ;
     35 ; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
     36 ; AVX512:       # %bb.0:
     37 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
     38 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
     39 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
     40 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     41 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     42 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     43 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
     44 ; AVX512-NEXT:    vzeroupper
     45 ; AVX512-NEXT:    retq
     46   %vec = load <32 x i8>, <32 x i8>* %L
     47   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
     48   store <16 x i8> %strided.vec, <16 x i8>* %S
     49   ret void
     50 }
     51 
     52 define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
     53 ; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
     54 ; AVX1:       # %bb.0:
     55 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
     56 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     57 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
     58 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     59 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     60 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     61 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
     62 ; AVX1-NEXT:    vzeroupper
     63 ; AVX1-NEXT:    retq
     64 ;
     65 ; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
     66 ; AVX2:       # %bb.0:
     67 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
     68 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     69 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
     70 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     71 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     72 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     73 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
     74 ; AVX2-NEXT:    vzeroupper
     75 ; AVX2-NEXT:    retq
     76 ;
     77 ; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
     78 ; AVX512:       # %bb.0:
     79 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
     80 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
     81 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
     82 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     83 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     84 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     85 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
     86 ; AVX512-NEXT:    vzeroupper
     87 ; AVX512-NEXT:    retq
     88   %vec = load <16 x i16>, <16 x i16>* %L
     89   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
     90   store <8 x i16> %strided.vec, <8 x i16>* %S
     91   ret void
     92 }
     93 
     94 define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
     95 ; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
     96 ; AVX:       # %bb.0:
     97 ; AVX-NEXT:    vmovaps (%rdi), %ymm0
     98 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
     99 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
    100 ; AVX-NEXT:    vmovaps %xmm0, (%rsi)
    101 ; AVX-NEXT:    vzeroupper
    102 ; AVX-NEXT:    retq
    103 ;
    104 ; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
    105 ; AVX512:       # %bb.0:
    106 ; AVX512-NEXT:    vmovaps (%rdi), %ymm0
    107 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    108 ; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
    109 ; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
    110 ; AVX512-NEXT:    vzeroupper
    111 ; AVX512-NEXT:    retq
    112   %vec = load <8 x i32>, <8 x i32>* %L
    113   %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
    114   store <4 x i32> %strided.vec, <4 x i32>* %S
    115   ret void
    116 }
    117 
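; Stride-4 extraction of <8 x i8> from a <32 x i8> load, offsets 1 through 3.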
    118 define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
    119 ; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
    120 ; AVX1:       # %bb.0:
    121 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    122 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    123 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    124 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    125 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    126 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    127 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
    128 ; AVX1-NEXT:    vzeroupper
    129 ; AVX1-NEXT:    retq
    130 ;
    131 ; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
    132 ; AVX2:       # %bb.0:
    133 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    134 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    135 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    136 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    137 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    138 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    139 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
    140 ; AVX2-NEXT:    vzeroupper
    141 ; AVX2-NEXT:    retq
    142 ;
    143 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
    144 ; AVX512F:       # %bb.0:
    145 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    146 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    147 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    148 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    149 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    150 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    151 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
    152 ; AVX512F-NEXT:    vzeroupper
    153 ; AVX512F-NEXT:    retq
    154 ;
    155 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
    156 ; AVX512VL:       # %bb.0:
    157 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    158 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    159 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    160 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    161 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    162 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    163 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
    164 ; AVX512VL-NEXT:    vzeroupper
    165 ; AVX512VL-NEXT:    retq
    166 ;
    167 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
    168 ; AVX512BW:       # %bb.0:
    169 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    170 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    171 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
    172 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    173 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    174 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    175 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
    176 ; AVX512BW-NEXT:    vzeroupper
    177 ; AVX512BW-NEXT:    retq
    178 ;
    179 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
    180 ; AVX512BWVL:       # %bb.0:
    181 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    182 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    183 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
    184 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    185 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    186 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    187 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
    188 ; AVX512BWVL-NEXT:    vzeroupper
    189 ; AVX512BWVL-NEXT:    retq
    190   %vec = load <32 x i8>, <32 x i8>* %L
    191   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
    192   store <8 x i8> %strided.vec, <8 x i8>* %S
    193   ret void
    194 }
    195 
    196 define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
    197 ; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
    198 ; AVX1:       # %bb.0:
    199 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    200 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    201 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    202 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    203 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    204 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    205 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
    206 ; AVX1-NEXT:    vzeroupper
    207 ; AVX1-NEXT:    retq
    208 ;
    209 ; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
    210 ; AVX2:       # %bb.0:
    211 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    212 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    213 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    214 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    215 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    216 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    217 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
    218 ; AVX2-NEXT:    vzeroupper
    219 ; AVX2-NEXT:    retq
    220 ;
    221 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
    222 ; AVX512F:       # %bb.0:
    223 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    224 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    225 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    226 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    227 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    228 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    229 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
    230 ; AVX512F-NEXT:    vzeroupper
    231 ; AVX512F-NEXT:    retq
    232 ;
    233 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
    234 ; AVX512VL:       # %bb.0:
    235 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    236 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    237 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    238 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    239 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    240 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    241 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
    242 ; AVX512VL-NEXT:    vzeroupper
    243 ; AVX512VL-NEXT:    retq
    244 ;
    245 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
    246 ; AVX512BW:       # %bb.0:
    247 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    248 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    249 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
    250 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    251 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    252 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    253 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
    254 ; AVX512BW-NEXT:    vzeroupper
    255 ; AVX512BW-NEXT:    retq
    256 ;
    257 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
    258 ; AVX512BWVL:       # %bb.0:
    259 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    260 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    261 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
    262 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    263 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    264 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    265 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
    266 ; AVX512BWVL-NEXT:    vzeroupper
    267 ; AVX512BWVL-NEXT:    retq
    268   %vec = load <32 x i8>, <32 x i8>* %L
    269   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
    270   store <8 x i8> %strided.vec, <8 x i8>* %S
    271   ret void
    272 }
    273 
    274 define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
    275 ; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
    276 ; AVX1:       # %bb.0:
    277 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    278 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    279 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    280 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    281 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    282 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    283 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
    284 ; AVX1-NEXT:    vzeroupper
    285 ; AVX1-NEXT:    retq
    286 ;
    287 ; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
    288 ; AVX2:       # %bb.0:
    289 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    290 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    291 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    292 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    293 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    294 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    295 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
    296 ; AVX2-NEXT:    vzeroupper
    297 ; AVX2-NEXT:    retq
    298 ;
    299 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
    300 ; AVX512F:       # %bb.0:
    301 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    302 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    303 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    304 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    305 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    306 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    307 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
    308 ; AVX512F-NEXT:    vzeroupper
    309 ; AVX512F-NEXT:    retq
    310 ;
    311 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
    312 ; AVX512VL:       # %bb.0:
    313 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    314 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    315 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    316 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    317 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    318 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    319 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
    320 ; AVX512VL-NEXT:    vzeroupper
    321 ; AVX512VL-NEXT:    retq
    322 ;
    323 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
    324 ; AVX512BW:       # %bb.0:
    325 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    326 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    327 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
    328 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    329 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    330 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    331 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
    332 ; AVX512BW-NEXT:    vzeroupper
    333 ; AVX512BW-NEXT:    retq
    334 ;
    335 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
    336 ; AVX512BWVL:       # %bb.0:
    337 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    338 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    339 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
    340 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    341 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    342 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    343 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
    344 ; AVX512BWVL-NEXT:    vzeroupper
    345 ; AVX512BWVL-NEXT:    retq
    346   %vec = load <32 x i8>, <32 x i8>* %L
    347   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
    348   store <8 x i8> %strided.vec, <8 x i8>* %S
    349   ret void
    350 }
    351 
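; Stride-4 extraction of <4 x i16> from a <16 x i16> load, offsets 1 through 3.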
    352 define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
    353 ; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
    354 ; AVX1:       # %bb.0:
    355 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    356 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    357 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    358 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
    359 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    360 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
    361 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    362 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
    363 ; AVX1-NEXT:    vzeroupper
    364 ; AVX1-NEXT:    retq
    365 ;
    366 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
    367 ; AVX2-SLOW:       # %bb.0:
    368 ; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
    369 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    370 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    371 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
    372 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    373 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
    374 ; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    375 ; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
    376 ; AVX2-SLOW-NEXT:    vzeroupper
    377 ; AVX2-SLOW-NEXT:    retq
    378 ;
    379 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
    380 ; AVX2-FAST:       # %bb.0:
    381 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
    382 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
    383 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
    384 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    385 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    386 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    387 ; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
    388 ; AVX2-FAST-NEXT:    vzeroupper
    389 ; AVX2-FAST-NEXT:    retq
    390 ;
    391 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
    392 ; AVX512F:       # %bb.0:
    393 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    394 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    395 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
    396 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
    397 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
    398 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
    399 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    400 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
    401 ; AVX512F-NEXT:    vzeroupper
    402 ; AVX512F-NEXT:    retq
    403 ;
    404 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
    405 ; AVX512VL:       # %bb.0:
    406 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    407 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    408 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
    409 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    410 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    411 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    412 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
    413 ; AVX512VL-NEXT:    vzeroupper
    414 ; AVX512VL-NEXT:    retq
    415 ;
    416 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
    417 ; AVX512BW:       # %bb.0:
    418 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    419 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    420 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
    421 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    422 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    423 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    424 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
    425 ; AVX512BW-NEXT:    vzeroupper
    426 ; AVX512BW-NEXT:    retq
    427 ;
    428 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
    429 ; AVX512BWVL:       # %bb.0:
    430 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    431 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    432 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
    433 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    434 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    435 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    436 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
    437 ; AVX512BWVL-NEXT:    vzeroupper
    438 ; AVX512BWVL-NEXT:    retq
    439   %vec = load <16 x i16>, <16 x i16>* %L
    440   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
    441   store <4 x i16> %strided.vec, <4 x i16>* %S
    442   ret void
    443 }
    444 
    445 define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
    446 ; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
    447 ; AVX1:       # %bb.0:
    448 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    449 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    450 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
    451 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
    452 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    453 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
    454 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    455 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
    456 ; AVX1-NEXT:    vzeroupper
    457 ; AVX1-NEXT:    retq
    458 ;
    459 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
    460 ; AVX2-SLOW:       # %bb.0:
    461 ; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
    462 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    463 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
    464 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
    465 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    466 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
    467 ; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    468 ; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
    469 ; AVX2-SLOW-NEXT:    vzeroupper
    470 ; AVX2-SLOW-NEXT:    retq
    471 ;
    472 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
    473 ; AVX2-FAST:       # %bb.0:
    474 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
    475 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
    476 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
    477 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    478 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    479 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    480 ; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
    481 ; AVX2-FAST-NEXT:    vzeroupper
    482 ; AVX2-FAST-NEXT:    retq
    483 ;
    484 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
    485 ; AVX512F:       # %bb.0:
    486 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    487 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    488 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
    489 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
    490 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    491 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
    492 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    493 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
    494 ; AVX512F-NEXT:    vzeroupper
    495 ; AVX512F-NEXT:    retq
    496 ;
    497 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
    498 ; AVX512VL:       # %bb.0:
    499 ; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
    500 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
    501 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
    502 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
    503 ; AVX512VL-NEXT:    vzeroupper
    504 ; AVX512VL-NEXT:    retq
    505 ;
    506 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
    507 ; AVX512BW:       # %bb.0:
    508 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    509 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    510 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
    511 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    512 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    513 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    514 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
    515 ; AVX512BW-NEXT:    vzeroupper
    516 ; AVX512BW-NEXT:    retq
    517 ;
    518 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
    519 ; AVX512BWVL:       # %bb.0:
    520 ; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
    521 ; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
    522 ; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
    523 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
    524 ; AVX512BWVL-NEXT:    vzeroupper
    525 ; AVX512BWVL-NEXT:    retq
    526   %vec = load <16 x i16>, <16 x i16>* %L
    527   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
    528   store <4 x i16> %strided.vec, <4 x i16>* %S
    529   ret void
    530 }
    531 
    532 define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
    533 ; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
    534 ; AVX1:       # %bb.0:
    535 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    536 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    537 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
    538 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
    539 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    540 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
    541 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    542 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
    543 ; AVX1-NEXT:    vzeroupper
    544 ; AVX1-NEXT:    retq
    545 ;
    546 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
    547 ; AVX2-SLOW:       # %bb.0:
    548 ; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
    549 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    550 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
    551 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
    552 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    553 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
    554 ; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    555 ; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
    556 ; AVX2-SLOW-NEXT:    vzeroupper
    557 ; AVX2-SLOW-NEXT:    retq
    558 ;
    559 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
    560 ; AVX2-FAST:       # %bb.0:
    561 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
    562 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
    563 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
    564 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    565 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    566 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    567 ; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
    568 ; AVX2-FAST-NEXT:    vzeroupper
    569 ; AVX2-FAST-NEXT:    retq
    570 ;
    571 ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
    572 ; AVX512F:       # %bb.0:
    573 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    574 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    575 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
    576 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
    577 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    578 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
    579 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    580 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
    581 ; AVX512F-NEXT:    vzeroupper
    582 ; AVX512F-NEXT:    retq
    583 ;
    584 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
    585 ; AVX512VL:       # %bb.0:
    586 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    587 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    588 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
    589 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    590 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    591 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    592 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
    593 ; AVX512VL-NEXT:    vzeroupper
    594 ; AVX512VL-NEXT:    retq
    595 ;
    596 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
    597 ; AVX512BW:       # %bb.0:
    598 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    599 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    600 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
    601 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    602 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    603 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    604 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
    605 ; AVX512BW-NEXT:    vzeroupper
    606 ; AVX512BW-NEXT:    retq
    607 ;
    608 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
    609 ; AVX512BWVL:       # %bb.0:
    610 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    611 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    612 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
    613 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    614 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    615 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    616 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
    617 ; AVX512BWVL-NEXT:    vzeroupper
    618 ; AVX512BWVL-NEXT:    retq
    619   %vec = load <16 x i16>, <16 x i16>* %L
    620   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
    621   store <4 x i16> %strided.vec, <4 x i16>* %S
    622   ret void
    623 }
    624 
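; Stride-8 extraction of <4 x i8> from a <32 x i8> load, offsets 1 through 7.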
    625 define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
    626 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
    627 ; AVX1:       # %bb.0:
    628 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    629 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    630 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    631 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    632 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    633 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    634 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
    635 ; AVX1-NEXT:    vzeroupper
    636 ; AVX1-NEXT:    retq
    637 ;
    638 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
    639 ; AVX2:       # %bb.0:
    640 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    641 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    642 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    643 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    644 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    645 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    646 ; AVX2-NEXT:    vmovd %xmm0, (%rsi)
    647 ; AVX2-NEXT:    vzeroupper
    648 ; AVX2-NEXT:    retq
    649 ;
    650 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
    651 ; AVX512F:       # %bb.0:
    652 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    653 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    654 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    655 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    656 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    657 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    658 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
    659 ; AVX512F-NEXT:    vzeroupper
    660 ; AVX512F-NEXT:    retq
    661 ;
    662 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
    663 ; AVX512VL:       # %bb.0:
    664 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    665 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    666 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
    667 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    668 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    669 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    670 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
    671 ; AVX512VL-NEXT:    vzeroupper
    672 ; AVX512VL-NEXT:    retq
    673 ;
    674 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
    675 ; AVX512BW:       # %bb.0:
    676 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    677 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    678 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    679 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    680 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    681 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    682 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
    683 ; AVX512BW-NEXT:    vzeroupper
    684 ; AVX512BW-NEXT:    retq
    685 ;
    686 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
    687 ; AVX512BWVL:       # %bb.0:
    688 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    689 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    690 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]
    691 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    692 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    693 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    694 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
    695 ; AVX512BWVL-NEXT:    vzeroupper
    696 ; AVX512BWVL-NEXT:    retq
    697   %vec = load <32 x i8>, <32 x i8>* %L
    698   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
    699   store <4 x i8> %strided.vec, <4 x i8>* %S
    700   ret void
    701 }
    702 
    703 define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
    704 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
    705 ; AVX1:       # %bb.0:
    706 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    707 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    708 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    709 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    710 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    711 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    712 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
    713 ; AVX1-NEXT:    vzeroupper
    714 ; AVX1-NEXT:    retq
    715 ;
    716 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
    717 ; AVX2:       # %bb.0:
    718 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    719 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    720 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    721 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    722 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    723 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    724 ; AVX2-NEXT:    vmovd %xmm0, (%rsi)
    725 ; AVX2-NEXT:    vzeroupper
    726 ; AVX2-NEXT:    retq
    727 ;
    728 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
    729 ; AVX512F:       # %bb.0:
    730 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    731 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    732 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    733 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    734 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    735 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    736 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
    737 ; AVX512F-NEXT:    vzeroupper
    738 ; AVX512F-NEXT:    retq
    739 ;
    740 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
    741 ; AVX512VL:       # %bb.0:
    742 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    743 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    744 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
    745 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    746 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    747 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    748 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
    749 ; AVX512VL-NEXT:    vzeroupper
    750 ; AVX512VL-NEXT:    retq
    751 ;
    752 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
    753 ; AVX512BW:       # %bb.0:
    754 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    755 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    756 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    757 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    758 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    759 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    760 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
    761 ; AVX512BW-NEXT:    vzeroupper
    762 ; AVX512BW-NEXT:    retq
    763 ;
    764 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
    765 ; AVX512BWVL:       # %bb.0:
    766 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    767 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    768 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
    769 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    770 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    771 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    772 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
    773 ; AVX512BWVL-NEXT:    vzeroupper
    774 ; AVX512BWVL-NEXT:    retq
    775   %vec = load <32 x i8>, <32 x i8>* %L
    776   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
    777   store <4 x i8> %strided.vec, <4 x i8>* %S
    778   ret void
    779 }
    780 
    781 define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
    782 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
    783 ; AVX1:       # %bb.0:
    784 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    785 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    786 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    787 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    788 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    789 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    790 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
    791 ; AVX1-NEXT:    vzeroupper
    792 ; AVX1-NEXT:    retq
    793 ;
    794 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
    795 ; AVX2:       # %bb.0:
    796 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    797 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    798 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    799 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    800 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    801 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    802 ; AVX2-NEXT:    vmovd %xmm0, (%rsi)
    803 ; AVX2-NEXT:    vzeroupper
    804 ; AVX2-NEXT:    retq
    805 ;
    806 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
    807 ; AVX512F:       # %bb.0:
    808 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    809 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    810 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    811 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    812 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    813 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    814 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
    815 ; AVX512F-NEXT:    vzeroupper
    816 ; AVX512F-NEXT:    retq
    817 ;
    818 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
    819 ; AVX512VL:       # %bb.0:
    820 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    821 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    822 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
    823 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    824 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    825 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    826 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
    827 ; AVX512VL-NEXT:    vzeroupper
    828 ; AVX512VL-NEXT:    retq
    829 ;
    830 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
    831 ; AVX512BW:       # %bb.0:
    832 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    833 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    834 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    835 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    836 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    837 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    838 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
    839 ; AVX512BW-NEXT:    vzeroupper
    840 ; AVX512BW-NEXT:    retq
    841 ;
    842 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
    843 ; AVX512BWVL:       # %bb.0:
    844 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    845 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    846 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]
    847 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    848 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    849 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    850 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
    851 ; AVX512BWVL-NEXT:    vzeroupper
    852 ; AVX512BWVL-NEXT:    retq
    853   %vec = load <32 x i8>, <32 x i8>* %L
    854   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
    855   store <4 x i8> %strided.vec, <4 x i8>* %S
    856   ret void
    857 }
    858 
    859 define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
    860 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
    861 ; AVX1:       # %bb.0:
    862 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    863 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    864 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    865 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    866 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    867 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    868 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
    869 ; AVX1-NEXT:    vzeroupper
    870 ; AVX1-NEXT:    retq
    871 ;
    872 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
    873 ; AVX2:       # %bb.0:
    874 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    875 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    876 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    877 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    878 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    879 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    880 ; AVX2-NEXT:    vmovd %xmm0, (%rsi)
    881 ; AVX2-NEXT:    vzeroupper
    882 ; AVX2-NEXT:    retq
    883 ;
    884 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
    885 ; AVX512F:       # %bb.0:
    886 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    887 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    888 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    889 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    890 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    891 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    892 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
    893 ; AVX512F-NEXT:    vzeroupper
    894 ; AVX512F-NEXT:    retq
    895 ;
    896 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
    897 ; AVX512VL:       # %bb.0:
    898 ; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
    899 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
    900 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
    901 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
    902 ; AVX512VL-NEXT:    vzeroupper
    903 ; AVX512VL-NEXT:    retq
    904 ;
    905 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
    906 ; AVX512BW:       # %bb.0:
    907 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    908 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    909 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    910 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    911 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    912 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    913 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
    914 ; AVX512BW-NEXT:    vzeroupper
    915 ; AVX512BW-NEXT:    retq
    916 ;
    917 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
    918 ; AVX512BWVL:       # %bb.0:
    919 ; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
    920 ; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
    921 ; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
    922 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
    923 ; AVX512BWVL-NEXT:    vzeroupper
    924 ; AVX512BWVL-NEXT:    retq
    925   %vec = load <32 x i8>, <32 x i8>* %L
    926   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
    927   store <4 x i8> %strided.vec, <4 x i8>* %S
    928   ret void
    929 }
    930 
    931 define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
    932 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
    933 ; AVX1:       # %bb.0:
    934 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    935 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    936 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    937 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    938 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    939 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    940 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
    941 ; AVX1-NEXT:    vzeroupper
    942 ; AVX1-NEXT:    retq
    943 ;
    944 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
    945 ; AVX2:       # %bb.0:
    946 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    947 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    948 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    949 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    950 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    951 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    952 ; AVX2-NEXT:    vmovd %xmm0, (%rsi)
    953 ; AVX2-NEXT:    vzeroupper
    954 ; AVX2-NEXT:    retq
    955 ;
    956 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
    957 ; AVX512F:       # %bb.0:
    958 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    959 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    960 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    961 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    962 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    963 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    964 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
    965 ; AVX512F-NEXT:    vzeroupper
    966 ; AVX512F-NEXT:    retq
    967 ;
    968 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
    969 ; AVX512VL:       # %bb.0:
    970 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    971 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    972 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
    973 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    974 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    975 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    976 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
    977 ; AVX512VL-NEXT:    vzeroupper
    978 ; AVX512VL-NEXT:    retq
    979 ;
    980 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
    981 ; AVX512BW:       # %bb.0:
    982 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    983 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    984 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
    985 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    986 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    987 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    988 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
    989 ; AVX512BW-NEXT:    vzeroupper
    990 ; AVX512BW-NEXT:    retq
    991 ;
    992 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
    993 ; AVX512BWVL:       # %bb.0:
    994 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    995 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    996 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7]
    997 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    998 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    999 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1000 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
   1001 ; AVX512BWVL-NEXT:    vzeroupper
   1002 ; AVX512BWVL-NEXT:    retq
   1003   %vec = load <32 x i8>, <32 x i8>* %L
   1004   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
   1005   store <4 x i8> %strided.vec, <4 x i8>* %S
   1006   ret void
   1007 }
   1008 
   1009 define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
   1010 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
   1011 ; AVX1:       # %bb.0:
   1012 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1013 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1014 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
   1015 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1016 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1017 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1018 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
   1019 ; AVX1-NEXT:    vzeroupper
   1020 ; AVX1-NEXT:    retq
   1021 ;
   1022 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
   1023 ; AVX2:       # %bb.0:
   1024 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   1025 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1026 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
   1027 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1028 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1029 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1030 ; AVX2-NEXT:    vmovd %xmm0, (%rsi)
   1031 ; AVX2-NEXT:    vzeroupper
   1032 ; AVX2-NEXT:    retq
   1033 ;
   1034 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
   1035 ; AVX512F:       # %bb.0:
   1036 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
   1037 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1038 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
   1039 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1040 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1041 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1042 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
   1043 ; AVX512F-NEXT:    vzeroupper
   1044 ; AVX512F-NEXT:    retq
   1045 ;
   1046 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
   1047 ; AVX512VL:       # %bb.0:
   1048 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
   1049 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1050 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
   1051 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1052 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1053 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1054 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
   1055 ; AVX512VL-NEXT:    vzeroupper
   1056 ; AVX512VL-NEXT:    retq
   1057 ;
   1058 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
   1059 ; AVX512BW:       # %bb.0:
   1060 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
   1061 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1062 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
   1063 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1064 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1065 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1066 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
   1067 ; AVX512BW-NEXT:    vzeroupper
   1068 ; AVX512BW-NEXT:    retq
   1069 ;
   1070 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
   1071 ; AVX512BWVL:       # %bb.0:
   1072 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
   1073 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1074 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
   1075 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1076 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1077 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1078 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
   1079 ; AVX512BWVL-NEXT:    vzeroupper
   1080 ; AVX512BWVL-NEXT:    retq
   1081   %vec = load <32 x i8>, <32 x i8>* %L
   1082   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
   1083   store <4 x i8> %strided.vec, <4 x i8>* %S
   1084   ret void
   1085 }
   1086 
   1087 define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
   1088 ; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
   1089 ; AVX1:       # %bb.0:
   1090 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1091 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1092 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
   1093 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1094 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1095 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1096 ; AVX1-NEXT:    vmovd %xmm0, (%rsi)
   1097 ; AVX1-NEXT:    vzeroupper
   1098 ; AVX1-NEXT:    retq
   1099 ;
   1100 ; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
   1101 ; AVX2:       # %bb.0:
   1102 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
   1103 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1104 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
   1105 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1106 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1107 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1108 ; AVX2-NEXT:    vmovd %xmm0, (%rsi)
   1109 ; AVX2-NEXT:    vzeroupper
   1110 ; AVX2-NEXT:    retq
   1111 ;
   1112 ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
   1113 ; AVX512F:       # %bb.0:
   1114 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
   1115 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1116 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
   1117 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1118 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1119 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1120 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
   1121 ; AVX512F-NEXT:    vzeroupper
   1122 ; AVX512F-NEXT:    retq
   1123 ;
   1124 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
   1125 ; AVX512VL:       # %bb.0:
   1126 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
   1127 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1128 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
   1129 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1130 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1131 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1132 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
   1133 ; AVX512VL-NEXT:    vzeroupper
   1134 ; AVX512VL-NEXT:    retq
   1135 ;
   1136 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
   1137 ; AVX512BW:       # %bb.0:
   1138 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
   1139 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1140 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
   1141 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1142 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1143 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1144 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
   1145 ; AVX512BW-NEXT:    vzeroupper
   1146 ; AVX512BW-NEXT:    retq
   1147 ;
   1148 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
   1149 ; AVX512BWVL:       # %bb.0:
   1150 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
   1151 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1152 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
   1153 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1154 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1155 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1156 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
   1157 ; AVX512BWVL-NEXT:    vzeroupper
   1158 ; AVX512BWVL-NEXT:    retq
   1159   %vec = load <32 x i8>, <32 x i8>* %L
   1160   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
   1161   store <4 x i8> %strided.vec, <4 x i8>* %S
   1162   ret void
   1163 }
   1164 