      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
     10 
     11 ; PR31551
     12 ; Pairs of shufflevector:trunc functions with functional equivalence.
     13 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
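; Illustrative sketch (a comment only, not part of the generated checks): for the
; first pair below, the even-lane shuffle and the bitcast+trunc are expected to pick
; the same bytes on little-endian x86-64, since truncating i16 -> i8 keeps the low
; (even-indexed) byte of each element:
;   %even  = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, ..., i32 30>
;   %wide  = bitcast <32 x i8> %v to <16 x i16>
;   %trunc = trunc <16 x i16> %wide to <16 x i8>
; so %even and %trunc should lower to equally good code.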
     14 
     15 define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
     16 ; AVX1-LABEL: shuffle_v32i8_to_v16i8:
     17 ; AVX1:       # %bb.0:
     18 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
     19 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     20 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
     21 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     22 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     23 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     24 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
     25 ; AVX1-NEXT:    vzeroupper
     26 ; AVX1-NEXT:    retq
     27 ;
     28 ; AVX2-LABEL: shuffle_v32i8_to_v16i8:
     29 ; AVX2:       # %bb.0:
     30 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
     31 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     32 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
     33 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     34 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     35 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     36 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
     37 ; AVX2-NEXT:    vzeroupper
     38 ; AVX2-NEXT:    retq
     39 ;
     40 ; AVX512-LABEL: shuffle_v32i8_to_v16i8:
     41 ; AVX512:       # %bb.0:
     42 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
     43 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
     44 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
     45 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     46 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     47 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     48 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
     49 ; AVX512-NEXT:    vzeroupper
     50 ; AVX512-NEXT:    retq
     51   %vec = load <32 x i8>, <32 x i8>* %L
     52   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
     53   store <16 x i8> %strided.vec, <16 x i8>* %S
     54   ret void
     55 }
     56 
     57 define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
     58 ; AVX1-LABEL: trunc_v16i16_to_v16i8:
     59 ; AVX1:       # %bb.0:
     60 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
     61 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     62 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
     63 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     64 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     65 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     66 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
     67 ; AVX1-NEXT:    vzeroupper
     68 ; AVX1-NEXT:    retq
     69 ;
     70 ; AVX2-LABEL: trunc_v16i16_to_v16i8:
     71 ; AVX2:       # %bb.0:
     72 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
     73 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     74 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
     75 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
     76 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
     77 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
     78 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
     79 ; AVX2-NEXT:    vzeroupper
     80 ; AVX2-NEXT:    retq
     81 ;
     82 ; AVX512F-LABEL: trunc_v16i16_to_v16i8:
     83 ; AVX512F:       # %bb.0:
     84 ; AVX512F-NEXT:    vpmovsxwd (%rdi), %zmm0
     85 ; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
     86 ; AVX512F-NEXT:    vzeroupper
     87 ; AVX512F-NEXT:    retq
     88 ;
     89 ; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
     90 ; AVX512VL:       # %bb.0:
     91 ; AVX512VL-NEXT:    vpmovsxwd (%rdi), %zmm0
     92 ; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
     93 ; AVX512VL-NEXT:    vzeroupper
     94 ; AVX512VL-NEXT:    retq
     95 ;
     96 ; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
     97 ; AVX512BW:       # %bb.0:
     98 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
     99 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
    100 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
    101 ; AVX512BW-NEXT:    vzeroupper
    102 ; AVX512BW-NEXT:    retq
    103 ;
    104 ; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
    105 ; AVX512BWVL:       # %bb.0:
    106 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    107 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
    108 ; AVX512BWVL-NEXT:    vzeroupper
    109 ; AVX512BWVL-NEXT:    retq
    110 ;
    111 ; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
    112 ; AVX512VBMIVL:       # %bb.0:
    113 ; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
    114 ; AVX512VBMIVL-NEXT:    vpmovwb %ymm0, (%rsi)
    115 ; AVX512VBMIVL-NEXT:    vzeroupper
    116 ; AVX512VBMIVL-NEXT:    retq
    117   %vec = load <32 x i8>, <32 x i8>* %L
    118   %bc = bitcast <32 x i8> %vec to <16 x i16>
    119   %strided.vec = trunc <16 x i16> %bc to <16 x i8>
    120   store <16 x i8> %strided.vec, <16 x i8>* %S
    121   ret void
    122 }
    123 
    124 define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
    125 ; AVX1-LABEL: shuffle_v16i16_to_v8i16:
    126 ; AVX1:       # %bb.0:
    127 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    128 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    129 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    130 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    131 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    132 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    133 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
    134 ; AVX1-NEXT:    vzeroupper
    135 ; AVX1-NEXT:    retq
    136 ;
    137 ; AVX2-LABEL: shuffle_v16i16_to_v8i16:
    138 ; AVX2:       # %bb.0:
    139 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    140 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    141 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    142 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    143 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    144 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    145 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
    146 ; AVX2-NEXT:    vzeroupper
    147 ; AVX2-NEXT:    retq
    148 ;
    149 ; AVX512-LABEL: shuffle_v16i16_to_v8i16:
    150 ; AVX512:       # %bb.0:
    151 ; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
    152 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
    153 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    154 ; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    155 ; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    156 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    157 ; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
    158 ; AVX512-NEXT:    vzeroupper
    159 ; AVX512-NEXT:    retq
    160   %vec = load <16 x i16>, <16 x i16>* %L
    161   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
    162   store <8 x i16> %strided.vec, <8 x i16>* %S
    163   ret void
    164 }
    165 
    166 define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
    167 ; AVX1-LABEL: trunc_v8i32_to_v8i16:
    168 ; AVX1:       # %bb.0:
    169 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    170 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    171 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    172 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    173 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    174 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    175 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
    176 ; AVX1-NEXT:    vzeroupper
    177 ; AVX1-NEXT:    retq
    178 ;
    179 ; AVX2-LABEL: trunc_v8i32_to_v8i16:
    180 ; AVX2:       # %bb.0:
    181 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    182 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    183 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    184 ; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
    185 ; AVX2-NEXT:    vzeroupper
    186 ; AVX2-NEXT:    retq
    187 ;
    188 ; AVX512F-LABEL: trunc_v8i32_to_v8i16:
    189 ; AVX512F:       # %bb.0:
    190 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    191 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    192 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
    193 ; AVX512F-NEXT:    vzeroupper
    194 ; AVX512F-NEXT:    retq
    195 ;
    196 ; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
    197 ; AVX512VL:       # %bb.0:
    198 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    199 ; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
    200 ; AVX512VL-NEXT:    vzeroupper
    201 ; AVX512VL-NEXT:    retq
    202 ;
    203 ; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
    204 ; AVX512BW:       # %bb.0:
    205 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    206 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
    207 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
    208 ; AVX512BW-NEXT:    vzeroupper
    209 ; AVX512BW-NEXT:    retq
    210 ;
    211 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
    212 ; AVX512BWVL:       # %bb.0:
    213 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    214 ; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
    215 ; AVX512BWVL-NEXT:    vzeroupper
    216 ; AVX512BWVL-NEXT:    retq
    217 ;
    218 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
    219 ; AVX512VBMIVL:       # %bb.0:
    220 ; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
    221 ; AVX512VBMIVL-NEXT:    vpmovdw %ymm0, (%rsi)
    222 ; AVX512VBMIVL-NEXT:    vzeroupper
    223 ; AVX512VBMIVL-NEXT:    retq
    224   %vec = load <16 x i16>, <16 x i16>* %L
    225   %bc = bitcast <16 x i16> %vec to <8 x i32>
    226   %strided.vec = trunc <8 x i32> %bc to <8 x i16>
    227   store <8 x i16> %strided.vec, <8 x i16>* %S
    228   ret void
    229 }
    230 
    231 define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
    232 ; AVX-LABEL: shuffle_v8i32_to_v4i32:
    233 ; AVX:       # %bb.0:
    234 ; AVX-NEXT:    vmovaps (%rdi), %ymm0
    235 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    236 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    237 ; AVX-NEXT:    vmovaps %xmm0, (%rsi)
    238 ; AVX-NEXT:    vzeroupper
    239 ; AVX-NEXT:    retq
    240 ;
    241 ; AVX512-LABEL: shuffle_v8i32_to_v4i32:
    242 ; AVX512:       # %bb.0:
    243 ; AVX512-NEXT:    vmovaps (%rdi), %ymm0
    244 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    245 ; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    246 ; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
    247 ; AVX512-NEXT:    vzeroupper
    248 ; AVX512-NEXT:    retq
    249   %vec = load <8 x i32>, <8 x i32>* %L
    250   %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    251   store <4 x i32> %strided.vec, <4 x i32>* %S
    252   ret void
    253 }
    254 
    255 define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
    256 ; AVX1-LABEL: trunc_v4i64_to_v4i32:
    257 ; AVX1:       # %bb.0:
    258 ; AVX1-NEXT:    vmovaps (%rdi), %ymm0
    259 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    260 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    261 ; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
    262 ; AVX1-NEXT:    vzeroupper
    263 ; AVX1-NEXT:    retq
    264 ;
    265 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
    266 ; AVX2-SLOW:       # %bb.0:
    267 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
    268 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
    269 ; AVX2-SLOW-NEXT:    vmovaps %xmm0, (%rsi)
    270 ; AVX2-SLOW-NEXT:    vzeroupper
    271 ; AVX2-SLOW-NEXT:    retq
    272 ;
    273 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
    274 ; AVX2-FAST:       # %bb.0:
    275 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
    276 ; AVX2-FAST-NEXT:    vpermps (%rdi), %ymm0, %ymm0
    277 ; AVX2-FAST-NEXT:    vmovaps %xmm0, (%rsi)
    278 ; AVX2-FAST-NEXT:    vzeroupper
    279 ; AVX2-FAST-NEXT:    retq
    280 ;
    281 ; AVX512F-LABEL: trunc_v4i64_to_v4i32:
    282 ; AVX512F:       # %bb.0:
    283 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    284 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
    285 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
    286 ; AVX512F-NEXT:    vzeroupper
    287 ; AVX512F-NEXT:    retq
    288 ;
    289 ; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
    290 ; AVX512VL:       # %bb.0:
    291 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    292 ; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
    293 ; AVX512VL-NEXT:    vzeroupper
    294 ; AVX512VL-NEXT:    retq
    295 ;
    296 ; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
    297 ; AVX512BW:       # %bb.0:
    298 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    299 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
    300 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
    301 ; AVX512BW-NEXT:    vzeroupper
    302 ; AVX512BW-NEXT:    retq
    303 ;
    304 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
    305 ; AVX512BWVL:       # %bb.0:
    306 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    307 ; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
    308 ; AVX512BWVL-NEXT:    vzeroupper
    309 ; AVX512BWVL-NEXT:    retq
    310 ;
    311 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
    312 ; AVX512VBMIVL:       # %bb.0:
    313 ; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
    314 ; AVX512VBMIVL-NEXT:    vpmovqd %ymm0, (%rsi)
    315 ; AVX512VBMIVL-NEXT:    vzeroupper
    316 ; AVX512VBMIVL-NEXT:    retq
    317   %vec = load <8 x i32>, <8 x i32>* %L
    318   %bc = bitcast <8 x i32> %vec to <4 x i64>
    319   %strided.vec = trunc <4 x i64> %bc to <4 x i32>
    320   store <4 x i32> %strided.vec, <4 x i32>* %S
    321   ret void
    322 }
    323 
    324 define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
    325 ; AVX1-LABEL: shuffle_v32i8_to_v8i8:
    326 ; AVX1:       # %bb.0:
    327 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    328 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    329 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    330 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    331 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    332 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    333 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
    334 ; AVX1-NEXT:    vzeroupper
    335 ; AVX1-NEXT:    retq
    336 ;
    337 ; AVX2-LABEL: shuffle_v32i8_to_v8i8:
    338 ; AVX2:       # %bb.0:
    339 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    340 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    341 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    342 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    343 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    344 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    345 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
    346 ; AVX2-NEXT:    vzeroupper
    347 ; AVX2-NEXT:    retq
    348 ;
    349 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
    350 ; AVX512F:       # %bb.0:
    351 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    352 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    353 ; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    354 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    355 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    356 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    357 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
    358 ; AVX512F-NEXT:    vzeroupper
    359 ; AVX512F-NEXT:    retq
    360 ;
    361 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
    362 ; AVX512VL:       # %bb.0:
    363 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    364 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    365 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    366 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    367 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    368 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    369 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
    370 ; AVX512VL-NEXT:    vzeroupper
    371 ; AVX512VL-NEXT:    retq
    372 ;
    373 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
    374 ; AVX512BW:       # %bb.0:
    375 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    376 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    377 ; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    378 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    379 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    380 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    381 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
    382 ; AVX512BW-NEXT:    vzeroupper
    383 ; AVX512BW-NEXT:    retq
    384 ;
    385 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
    386 ; AVX512BWVL:       # %bb.0:
    387 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    388 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    389 ; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    390 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    391 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    392 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    393 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
    394 ; AVX512BWVL-NEXT:    vzeroupper
    395 ; AVX512BWVL-NEXT:    retq
    396 ;
    397 ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
    398 ; AVX512VBMIVL:       # %bb.0:
    399 ; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
    400 ; AVX512VBMIVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    401 ; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    402 ; AVX512VBMIVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    403 ; AVX512VBMIVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    404 ; AVX512VBMIVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    405 ; AVX512VBMIVL-NEXT:    vpmovwb %xmm0, (%rsi)
    406 ; AVX512VBMIVL-NEXT:    vzeroupper
    407 ; AVX512VBMIVL-NEXT:    retq
    408   %vec = load <32 x i8>, <32 x i8>* %L
    409   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
    410   store <8 x i8> %strided.vec, <8 x i8>* %S
    411   ret void
    412 }
    413 
    414 define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
    415 ; AVX1-LABEL: trunc_v8i32_to_v8i8:
    416 ; AVX1:       # %bb.0:
    417 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
    418 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    419 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
    420 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    421 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    422 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    423 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
    424 ; AVX1-NEXT:    vzeroupper
    425 ; AVX1-NEXT:    retq
    426 ;
    427 ; AVX2-LABEL: trunc_v8i32_to_v8i8:
    428 ; AVX2:       # %bb.0:
    429 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
    430 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    431 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    432 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
    433 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
    434 ; AVX2-NEXT:    vzeroupper
    435 ; AVX2-NEXT:    retq
    436 ;
    437 ; AVX512F-LABEL: trunc_v8i32_to_v8i8:
    438 ; AVX512F:       # %bb.0:
    439 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
    440 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    441 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
    442 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
    443 ; AVX512F-NEXT:    vzeroupper
    444 ; AVX512F-NEXT:    retq
    445 ;
    446 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
    447 ; AVX512VL:       # %bb.0:
    448 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
    449 ; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
    450 ; AVX512VL-NEXT:    vzeroupper
    451 ; AVX512VL-NEXT:    retq
    452 ;
    453 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
    454 ; AVX512BW:       # %bb.0:
    455 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
    456 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
    457 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
    458 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
    459 ; AVX512BW-NEXT:    vzeroupper
    460 ; AVX512BW-NEXT:    retq
    461 ;
    462 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
    463 ; AVX512BWVL:       # %bb.0:
    464 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
    465 ; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
    466 ; AVX512BWVL-NEXT:    vzeroupper
    467 ; AVX512BWVL-NEXT:    retq
    468 ;
    469 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
    470 ; AVX512VBMIVL:       # %bb.0:
    471 ; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
    472 ; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, (%rsi)
    473 ; AVX512VBMIVL-NEXT:    vzeroupper
    474 ; AVX512VBMIVL-NEXT:    retq
    475   %vec = load <32 x i8>, <32 x i8>* %L
    476   %bc = bitcast <32 x i8> %vec to <8 x i32>
    477   %strided.vec = trunc <8 x i32> %bc to <8 x i8>
    478   store <8 x i8> %strided.vec, <8 x i8>* %S
    479   ret void
    480 }
    481 
    482 define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
    483 ; IR generated from:
    484 ; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
    485 ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
    486 ; AVX1:       # %bb.0:
    487 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    488 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    489 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    490 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    491 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    492 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    493 ; AVX1-NEXT:    vzeroupper
    494 ; AVX1-NEXT:    retq
    495 ;
    496 ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
    497 ; AVX2:       # %bb.0:
    498 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    499 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    500 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    501 ; AVX2-NEXT:    vzeroupper
    502 ; AVX2-NEXT:    retq
    503 ;
    504 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
    505 ; AVX512F:       # %bb.0:
    506 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    507 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    508 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    509 ; AVX512F-NEXT:    vzeroupper
    510 ; AVX512F-NEXT:    retq
    511 ;
    512 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
    513 ; AVX512VL:       # %bb.0:
    514 ; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
    515 ; AVX512VL-NEXT:    vzeroupper
    516 ; AVX512VL-NEXT:    retq
    517 ;
    518 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
    519 ; AVX512BW:       # %bb.0:
    520 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    521 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
    522 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    523 ; AVX512BW-NEXT:    vzeroupper
    524 ; AVX512BW-NEXT:    retq
    525 ;
    526 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
    527 ; AVX512BWVL:       # %bb.0:
    528 ; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
    529 ; AVX512BWVL-NEXT:    vzeroupper
    530 ; AVX512BWVL-NEXT:    retq
    531 ;
    532 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
    533 ; AVX512VBMIVL:       # %bb.0:
    534 ; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
    535 ; AVX512VBMIVL-NEXT:    vzeroupper
    536 ; AVX512VBMIVL-NEXT:    retq
    537   %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
    538   %bc = bitcast <8 x i8> %truncated.vec to i64
    539   %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
    540   ret <2 x i64> %result
    541 }
    542 
    543 define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
    544 ; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
    545 ; AVX1:       # %bb.0:
    546 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    547 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    548 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    549 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    550 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    551 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    552 ; AVX1-NEXT:    vzeroupper
    553 ; AVX1-NEXT:    retq
    554 ;
    555 ; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
    556 ; AVX2:       # %bb.0:
    557 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    558 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    559 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    560 ; AVX2-NEXT:    vzeroupper
    561 ; AVX2-NEXT:    retq
    562 ;
    563 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
    564 ; AVX512F:       # %bb.0:
    565 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    566 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    567 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    568 ; AVX512F-NEXT:    vzeroupper
    569 ; AVX512F-NEXT:    retq
    570 ;
    571 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
    572 ; AVX512VL:       # %bb.0:
    573 ; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
    574 ; AVX512VL-NEXT:    vzeroupper
    575 ; AVX512VL-NEXT:    retq
    576 ;
    577 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
    578 ; AVX512BW:       # %bb.0:
    579 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    580 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
    581 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    582 ; AVX512BW-NEXT:    vzeroupper
    583 ; AVX512BW-NEXT:    retq
    584 ;
    585 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
    586 ; AVX512BWVL:       # %bb.0:
    587 ; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
    588 ; AVX512BWVL-NEXT:    vzeroupper
    589 ; AVX512BWVL-NEXT:    retq
    590 ;
    591 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
    592 ; AVX512VBMIVL:       # %bb.0:
    593 ; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
    594 ; AVX512VBMIVL-NEXT:    vzeroupper
    595 ; AVX512VBMIVL-NEXT:    retq
    596   %truncated = trunc <8 x i32> %vec to <8 x i8>
    597   %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
    598   %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
    599   %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
    600   ret <16 x i8> %result
    601 }
    602 
    603 define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
    604 ; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
    605 ; AVX1:       # %bb.0:
    606 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    607 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    608 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    609 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    610 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    611 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    612 ; AVX1-NEXT:    vzeroupper
    613 ; AVX1-NEXT:    retq
    614 ;
    615 ; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
    616 ; AVX2:       # %bb.0:
    617 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    618 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    619 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    620 ; AVX2-NEXT:    vzeroupper
    621 ; AVX2-NEXT:    retq
    622 ;
    623 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
    624 ; AVX512F:       # %bb.0:
    625 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    626 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    627 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    628 ; AVX512F-NEXT:    vzeroupper
    629 ; AVX512F-NEXT:    retq
    630 ;
    631 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
    632 ; AVX512VL:       # %bb.0:
    633 ; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
    634 ; AVX512VL-NEXT:    vzeroupper
    635 ; AVX512VL-NEXT:    retq
    636 ;
    637 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
    638 ; AVX512BW:       # %bb.0:
    639 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    640 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
    641 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    642 ; AVX512BW-NEXT:    vzeroupper
    643 ; AVX512BW-NEXT:    retq
    644 ;
    645 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
    646 ; AVX512BWVL:       # %bb.0:
    647 ; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
    648 ; AVX512BWVL-NEXT:    vzeroupper
    649 ; AVX512BWVL-NEXT:    retq
    650 ;
    651 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
    652 ; AVX512VBMIVL:       # %bb.0:
    653 ; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
    654 ; AVX512VBMIVL-NEXT:    vzeroupper
    655 ; AVX512VBMIVL-NEXT:    retq
    656   %truncated = trunc <8 x i32> %vec to <8 x i16>
    657   %bc = bitcast <8 x i16> %truncated to <16 x i8>
    658   %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
    659   ret <16 x i8> %result
    660 }
    661 
    662 define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
    663 ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
    664 ; AVX1:       # %bb.0:
    665 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    666 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    667 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    668 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    669 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
    670 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    671 ; AVX1-NEXT:    vzeroupper
    672 ; AVX1-NEXT:    retq
    673 ;
    674 ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
    675 ; AVX2:       # %bb.0:
    676 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
    677 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    678 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    679 ; AVX2-NEXT:    vzeroupper
    680 ; AVX2-NEXT:    retq
    681 ;
    682 ; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
    683 ; AVX512F:       # %bb.0:
    684 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    685 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    686 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    687 ; AVX512F-NEXT:    vzeroupper
    688 ; AVX512F-NEXT:    retq
    689 ;
    690 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
    691 ; AVX512VL:       # %bb.0:
    692 ; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
    693 ; AVX512VL-NEXT:    vzeroupper
    694 ; AVX512VL-NEXT:    retq
    695 ;
    696 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
    697 ; AVX512BW:       # %bb.0:
    698 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    699 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
    700 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
    701 ; AVX512BW-NEXT:    vzeroupper
    702 ; AVX512BW-NEXT:    retq
    703 ;
    704 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
    705 ; AVX512BWVL:       # %bb.0:
    706 ; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
    707 ; AVX512BWVL-NEXT:    vzeroupper
    708 ; AVX512BWVL-NEXT:    retq
    709 ;
    710 ; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
    711 ; AVX512VBMIVL:       # %bb.0:
    712 ; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
    713 ; AVX512VBMIVL-NEXT:    vzeroupper
    714 ; AVX512VBMIVL-NEXT:    retq
    715   %truncated = trunc <8 x i32> %vec to <8 x i8>
    716   %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    717   ret <16 x i8> %result
    718 }
    719 
    720 define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
    721 ; IR generated from:
    722 ; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
    723 ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
    724 ; AVX1:       # %bb.0:
    725 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    726 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    727 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    728 ; AVX1-NEXT:    vzeroupper
    729 ; AVX1-NEXT:    retq
    730 ;
    731 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
    732 ; AVX2-SLOW:       # %bb.0:
    733 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
    734 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    735 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    736 ; AVX2-SLOW-NEXT:    vzeroupper
    737 ; AVX2-SLOW-NEXT:    retq
    738 ;
    739 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
    740 ; AVX2-FAST:       # %bb.0:
    741 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
    742 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    743 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    744 ; AVX2-FAST-NEXT:    vzeroupper
    745 ; AVX2-FAST-NEXT:    retq
    746 ;
    747 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
    748 ; AVX512F:       # %bb.0:
    749 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    750 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
    751 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    752 ; AVX512F-NEXT:    vzeroupper
    753 ; AVX512F-NEXT:    retq
    754 ;
    755 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
    756 ; AVX512VL:       # %bb.0:
    757 ; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
    758 ; AVX512VL-NEXT:    vzeroupper
    759 ; AVX512VL-NEXT:    retq
    760 ;
    761 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
    762 ; AVX512BW:       # %bb.0:
    763 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    764 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
    765 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    766 ; AVX512BW-NEXT:    vzeroupper
    767 ; AVX512BW-NEXT:    retq
    768 ;
    769 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
    770 ; AVX512BWVL:       # %bb.0:
    771 ; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
    772 ; AVX512BWVL-NEXT:    vzeroupper
    773 ; AVX512BWVL-NEXT:    retq
    774 ;
    775 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
    776 ; AVX512VBMIVL:       # %bb.0:
    777 ; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
    778 ; AVX512VBMIVL-NEXT:    vzeroupper
    779 ; AVX512VBMIVL-NEXT:    retq
    780   %truncated = trunc <4 x i64> %vec to <4 x i16>
    781   %bc = bitcast <4 x i16> %truncated to i64
    782   %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
    783   ret <2 x i64> %result
    784 }
    785 
    786 define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
    787 ; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
    788 ; AVX1:       # %bb.0:
    789 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    790 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    791 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    792 ; AVX1-NEXT:    vzeroupper
    793 ; AVX1-NEXT:    retq
    794 ;
    795 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
    796 ; AVX2-SLOW:       # %bb.0:
    797 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
    798 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    799 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    800 ; AVX2-SLOW-NEXT:    vzeroupper
    801 ; AVX2-SLOW-NEXT:    retq
    802 ;
    803 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
    804 ; AVX2-FAST:       # %bb.0:
    805 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
    806 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    807 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    808 ; AVX2-FAST-NEXT:    vzeroupper
    809 ; AVX2-FAST-NEXT:    retq
    810 ;
    811 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
    812 ; AVX512F:       # %bb.0:
    813 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    814 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
    815 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    816 ; AVX512F-NEXT:    vzeroupper
    817 ; AVX512F-NEXT:    retq
    818 ;
    819 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
    820 ; AVX512VL:       # %bb.0:
    821 ; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
    822 ; AVX512VL-NEXT:    vzeroupper
    823 ; AVX512VL-NEXT:    retq
    824 ;
    825 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
    826 ; AVX512BW:       # %bb.0:
    827 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    828 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
    829 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    830 ; AVX512BW-NEXT:    vzeroupper
    831 ; AVX512BW-NEXT:    retq
    832 ;
    833 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
    834 ; AVX512BWVL:       # %bb.0:
    835 ; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
    836 ; AVX512BWVL-NEXT:    vzeroupper
    837 ; AVX512BWVL-NEXT:    retq
    838 ;
    839 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
    840 ; AVX512VBMIVL:       # %bb.0:
    841 ; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
    842 ; AVX512VBMIVL-NEXT:    vzeroupper
    843 ; AVX512VBMIVL-NEXT:    retq
    844   %truncated = trunc <4 x i64> %vec to <4 x i16>
    845   %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
    846   %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
    847   %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
    848   ret <8 x i16> %result
    849 }
    850 
    851 define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
    852 ; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
    853 ; AVX1:       # %bb.0:
    854 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    855 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    856 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    857 ; AVX1-NEXT:    vzeroupper
    858 ; AVX1-NEXT:    retq
    859 ;
    860 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
    861 ; AVX2-SLOW:       # %bb.0:
    862 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
    863 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    864 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    865 ; AVX2-SLOW-NEXT:    vzeroupper
    866 ; AVX2-SLOW-NEXT:    retq
    867 ;
    868 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
    869 ; AVX2-FAST:       # %bb.0:
    870 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
    871 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    872 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    873 ; AVX2-FAST-NEXT:    vzeroupper
    874 ; AVX2-FAST-NEXT:    retq
    875 ;
    876 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
    877 ; AVX512F:       # %bb.0:
    878 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    879 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
    880 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    881 ; AVX512F-NEXT:    vzeroupper
    882 ; AVX512F-NEXT:    retq
    883 ;
    884 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
    885 ; AVX512VL:       # %bb.0:
    886 ; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
    887 ; AVX512VL-NEXT:    vzeroupper
    888 ; AVX512VL-NEXT:    retq
    889 ;
    890 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
    891 ; AVX512BW:       # %bb.0:
    892 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    893 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
    894 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    895 ; AVX512BW-NEXT:    vzeroupper
    896 ; AVX512BW-NEXT:    retq
    897 ;
    898 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
    899 ; AVX512BWVL:       # %bb.0:
    900 ; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
    901 ; AVX512BWVL-NEXT:    vzeroupper
    902 ; AVX512BWVL-NEXT:    retq
    903 ;
    904 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
    905 ; AVX512VBMIVL:       # %bb.0:
    906 ; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
    907 ; AVX512VBMIVL-NEXT:    vzeroupper
    908 ; AVX512VBMIVL-NEXT:    retq
    909   %truncated = trunc <4 x i64> %vec to <4 x i32>
    910   %bc = bitcast <4 x i32> %truncated to <8 x i16>
    911   %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
    912   ret <8 x i16> %result
    913 }
    914 
    915 define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
    916 ; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
    917 ; AVX1:       # %bb.0:
    918 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    919 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    920 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    921 ; AVX1-NEXT:    vzeroupper
    922 ; AVX1-NEXT:    retq
    923 ;
    924 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
    925 ; AVX2-SLOW:       # %bb.0:
    926 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
    927 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    928 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    929 ; AVX2-SLOW-NEXT:    vzeroupper
    930 ; AVX2-SLOW-NEXT:    retq
    931 ;
    932 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
    933 ; AVX2-FAST:       # %bb.0:
    934 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
    935 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    936 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    937 ; AVX2-FAST-NEXT:    vzeroupper
    938 ; AVX2-FAST-NEXT:    retq
    939 ;
    940 ; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
    941 ; AVX512F:       # %bb.0:
    942 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    943 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
    944 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    945 ; AVX512F-NEXT:    vzeroupper
    946 ; AVX512F-NEXT:    retq
    947 ;
    948 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
    949 ; AVX512VL:       # %bb.0:
    950 ; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
    951 ; AVX512VL-NEXT:    vzeroupper
    952 ; AVX512VL-NEXT:    retq
    953 ;
    954 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
    955 ; AVX512BW:       # %bb.0:
    956 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
    957 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
    958 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
    959 ; AVX512BW-NEXT:    vzeroupper
    960 ; AVX512BW-NEXT:    retq
    961 ;
    962 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
    963 ; AVX512BWVL:       # %bb.0:
    964 ; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
    965 ; AVX512BWVL-NEXT:    vzeroupper
    966 ; AVX512BWVL-NEXT:    retq
    967 ;
    968 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
    969 ; AVX512VBMIVL:       # %bb.0:
    970 ; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
    971 ; AVX512VBMIVL-NEXT:    vzeroupper
    972 ; AVX512VBMIVL-NEXT:    retq
    973   %truncated = trunc <4 x i64> %vec to <4 x i16>
    974   %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    975   ret <8 x i16> %result
    976 }
    977 
    978 define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
    979 ; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
    980 ; AVX1:       # %bb.0:
    981 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    982 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
    983 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
    984 ; AVX1-NEXT:    vzeroupper
    985 ; AVX1-NEXT:    retq
    986 ;
    987 ; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
    988 ; AVX2-SLOW:       # %bb.0:
    989 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
    990 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    991 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
    992 ; AVX2-SLOW-NEXT:    vzeroupper
    993 ; AVX2-SLOW-NEXT:    retq
    994 ;
    995 ; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
    996 ; AVX2-FAST:       # %bb.0:
    997 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
    998 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    999 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
   1000 ; AVX2-FAST-NEXT:    vzeroupper
   1001 ; AVX2-FAST-NEXT:    retq
   1002 ;
   1003 ; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
   1004 ; AVX512F:       # %bb.0:
   1005 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   1006 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
   1007 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
   1008 ; AVX512F-NEXT:    vzeroupper
   1009 ; AVX512F-NEXT:    retq
   1010 ;
   1011 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
   1012 ; AVX512VL:       # %bb.0:
   1013 ; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
   1014 ; AVX512VL-NEXT:    vzeroupper
   1015 ; AVX512VL-NEXT:    retq
   1016 ;
   1017 ; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
   1018 ; AVX512BW:       # %bb.0:
   1019 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   1020 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
   1021 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
   1022 ; AVX512BW-NEXT:    vzeroupper
   1023 ; AVX512BW-NEXT:    retq
   1024 ;
   1025 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
   1026 ; AVX512BWVL:       # %bb.0:
   1027 ; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
   1028 ; AVX512BWVL-NEXT:    vzeroupper
   1029 ; AVX512BWVL-NEXT:    retq
   1030 ;
   1031 ; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
   1032 ; AVX512VBMIVL:       # %bb.0:
   1033 ; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, %xmm0
   1034 ; AVX512VBMIVL-NEXT:    vzeroupper
   1035 ; AVX512VBMIVL-NEXT:    retq
   1036   %truncated = trunc <4 x i64> %vec to <4 x i8>
   1037   %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
   1038   ret <16 x i8> %result
   1039 }
   1040 
   1041 define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
   1042 ; AVX1-LABEL: shuffle_v16i16_to_v4i16:
   1043 ; AVX1:       # %bb.0:
   1044 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
   1045 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1046 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1047 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   1048 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1049 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   1050 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1051 ; AVX1-NEXT:    vmovq %xmm0, (%rsi)
   1052 ; AVX1-NEXT:    vzeroupper
   1053 ; AVX1-NEXT:    retq
   1054 ;
   1055 ; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
   1056 ; AVX2-SLOW:       # %bb.0:
   1057 ; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
   1058 ; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1059 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1060 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   1061 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1062 ; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   1063 ; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1064 ; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
   1065 ; AVX2-SLOW-NEXT:    vzeroupper
   1066 ; AVX2-SLOW-NEXT:    retq
   1067 ;
   1068 ; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
   1069 ; AVX2-FAST:       # %bb.0:
   1070 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
   1071 ; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1072 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
   1073 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1074 ; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1075 ; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1076 ; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512VBMIVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VBMIVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd (%rdi), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %bc = bitcast <16 x i16> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i16>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512BWVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovaps (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512VBMIVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512VBMIVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd (%rdi), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %bc = bitcast <32 x i8> %vec to <4 x i64>
  %strided.vec = trunc <4 x i64> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; In this case not all elements are collected from the same source vector, so
; the resulting BUILD_VECTOR should not be combined to a truncate.
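; Concretely, the even lanes of the result come from %v, but element 0 is then
; replaced (via extractelement/insertelement) with element 0 of %w, so no
; single-source trunc pattern can express the merged vector.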
define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-LABEL: negative:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: negative:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: negative:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: negative:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: negative:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: negative:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT:    movl $65537, %eax # imm = 0x10001
; AVX512BWVL-NEXT:    kmovd %eax, %k1
; AVX512BWVL-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMIVL-LABEL: negative:
; AVX512VBMIVL:       # %bb.0:
; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512VBMIVL-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMIVL-NEXT:    vzeroupper
; AVX512VBMIVL-NEXT:    retq
  %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %w0 = extractelement <32 x i8> %w, i32 0
  %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
  ret <16 x i8> %merged
}