; Strided-extraction shuffle tests for AVX-512: each function loads a 512-bit
; vector and stores every Nth element starting at a nonzero offset.
; (Code-browser navigation header removed; it is not part of the test file.)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
      6 
; Extract the odd byte lanes (1,3,...,63) of a <64 x i8> load — i.e. the high
; byte of every i16 — and store the result as <32 x i8>.
define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512BWVL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  ; Stride-2 mask starting at element 1 (all odd indices of the 64-byte vector).
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  store <32 x i8> %strided.vec, <32 x i8>* %S
  ret void
}
     60 
; Extract the odd i16 lanes (1,3,...,31) of a <32 x i16> load and store them
; as <16 x i16>. AVX512BWVL should fold this into a single vpermi2w.
define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  ; Stride-2 mask starting at element 1 (all odd i16 indices).
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i16> %strided.vec, <16 x i16>* %S
  ret void
}
    112 
; Extract the odd i32 lanes (1,3,...,15) of a <16 x i32> load and store them
; as <8 x i32>. VL targets should use a single vpermi2d.
define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %zmm0
; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %zmm0
; AVX512BW-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i32>, <16 x i32>* %L
  ; Stride-2 mask starting at element 1 (all odd i32 indices).
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i32> %strided.vec, <8 x i32>* %S
  ret void
}
    158 
; Extract every 4th byte starting at index 1 (1,5,9,...,61) of a <64 x i8>
; load and store the 16 selected bytes as <16 x i8>.
define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  ; Stride-4 mask starting at element 1 (byte 1 of every i32 lane).
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
    240 
; Extract every 4th byte starting at index 2 (2,6,10,...,62) of a <64 x i8>
; load and store the 16 selected bytes as <16 x i8>.
define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  ; Stride-4 mask starting at element 2 (byte 2 of every i32 lane).
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
    322 
; Extract every 4th byte starting at index 3 (3,7,11,...,63) of a <64 x i8>
; load — the top byte of every i32 — and store the result as <16 x i8>.
define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  ; Stride-4 mask starting at element 3 (byte 3 of every i32 lane).
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}
    404 
; Extract every 4th i16 starting at index 1 (1,5,9,...,29) of a <32 x i16>
; load and store the 8 selected elements as <8 x i16>. AVX512BWVL should
; collapse this to a single cross-lane vpermi2w.
define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  ; Stride-4 mask starting at element 1 (i16 lane 1 of every i64).
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
    479 
; Extract every 4th i16 starting at index 2 (2,6,10,...,30) of a <32 x i16>
; load and store the 8 selected elements as <8 x i16>. AVX512BWVL should
; collapse this to a single cross-lane vpermi2w.
define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i16>, <32 x i16>* %L
  ; Stride-4 mask starting at element 2 (i16 lane 2 of every i64).
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}
    554 
    555 define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
    556 ; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3:
    557 ; AVX512F:       # %bb.0:
    558 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    559 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
    560 ; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
    561 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
    562 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
    563 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
    564 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
    565 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    566 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
    567 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
    568 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
    569 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
    570 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
    571 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    572 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
    573 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
    574 ; AVX512F-NEXT:    vzeroupper
    575 ; AVX512F-NEXT:    retq
    576 ;
    577 ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3:
    578 ; AVX512VL:       # %bb.0:
    579 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    580 ; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
    581 ; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
    582 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
    583 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
    584 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
    585 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    586 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
    587 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
    588 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
    589 ; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
    590 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    591 ; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
    592 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
    593 ; AVX512VL-NEXT:    vzeroupper
    594 ; AVX512VL-NEXT:    retq
    595 ;
    596 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
    597 ; AVX512BW:       # %bb.0:
    598 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
    599 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    600 ; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
    601 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
    602 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
    603 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
    604 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    605 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
    606 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
    607 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
    608 ; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
    609 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
    610 ; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
    611 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
    612 ; AVX512BW-NEXT:    vzeroupper
    613 ; AVX512BW-NEXT:    retq
    614 ;
    615 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
    616 ; AVX512BWVL:       # %bb.0:
    617 ; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
    618 ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    619 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
    620 ; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
    621 ; AVX512BWVL-NEXT:    vmovdqa %xmm2, (%rsi)
    622 ; AVX512BWVL-NEXT:    vzeroupper
    623 ; AVX512BWVL-NEXT:    retq
    624   %vec = load <32 x i16>, <32 x i16>* %L
    625   %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
    626   store <8 x i16> %strided.vec, <8 x i16>* %S
    627   ret void
    628 }
    629 
; Strided extract: every 8th i8 starting at index 1 (mask 1,9,...,57) from a
; <64 x i8> load, stored as <8 x i8>. Autogenerated CHECK lines
; (update_llc_test_checks.py) pin the lowering per AVX512 feature set.
define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
    711 
; Strided extract: every 8th i8 starting at index 2 (mask 2,10,...,58) from a
; <64 x i8> load, stored as <8 x i8>. CHECK lines are autogenerated
; (update_llc_test_checks.py); regenerate rather than hand-editing.
define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
    784 
; Strided extract: every 8th i8 starting at index 3 (mask 3,11,...,59) from a
; <64 x i8> load, stored as <8 x i8>. CHECK lines are autogenerated
; (update_llc_test_checks.py); regenerate rather than hand-editing.
define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
    866 
; Strided extract: every 8th i8 starting at index 4 (mask 4,12,...,60) from a
; <64 x i8> load, stored as <8 x i8>. CHECK lines are autogenerated
; (update_llc_test_checks.py); regenerate rather than hand-editing.
define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
    939 
; Strided extract: every 8th i8 starting at index 5 (mask 5,13,...,61) from a
; <64 x i8> load, stored as <8 x i8>. CHECK lines are autogenerated
; (update_llc_test_checks.py); regenerate rather than hand-editing.
define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
   1021 
; Strided extract: every 8th i8 starting at index 6 (mask 6,14,...,62) from a
; <64 x i8> load, stored as <8 x i8>. CHECK lines are autogenerated
; (update_llc_test_checks.py); regenerate rather than hand-editing.
define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512BWVL-NEXT:    vpmovwb %xmm2, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
   1094 
; Strided extract: take every 8th byte of a 64-byte vector, starting at byte 7
; (indices 7,15,23,...,63 — the highest byte of each 64-bit lane), and store the
; resulting <8 x i8>.  Checks below are autogenerated by
; utils/update_llc_test_checks.py; do not edit them by hand — regenerate instead.
define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
; Load the full 64-byte vector, select the stride-8 bytes at offset 7, and
; store the narrowed 8-byte result.
  %vec = load <64 x i8>, <64 x i8>* %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}
   1176 
   1177