; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

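; Truncate <8 x i64> to <8 x i32>: AVX512 does this with a single vpmovqd,
; while the SSE/AVX targets gather the even 32-bit elements with shuffles.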
define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i32>
  ret <8 x i32> %0
}

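; ashr by 32 followed by truncation selects the upper 32 bits of each element,
; so the pre-AVX512 targets reduce it to odd-element shuffles; AVX512 shifts
; with vpsraq and truncates with vpmovqd.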
define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

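; The lshr-by-32 variant: SSE/AVX1 still lower to odd-element shuffles, AVX2
; shifts first and then compacts, and AVX512 uses vpsrlq + vpmovqd.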
define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_lshr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i32_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    retq
entry:
  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

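; Truncate <8 x i64> to <8 x i16>: AVX512 needs only vpmovqw; SSE4.1/AVX1 zero
; the unwanted lanes with pblendw and funnel through packusdw, while
; SSE2/SSSE3 fall back to shuffle chains.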
define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i64_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i64_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i16:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i16:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %0
}

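; Truncate <8 x i64> to <8 x i8> and store: AVX512 truncates straight to
; memory with vpmovqb; older targets mask each element to 8 bits and pack
; down in stages.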
define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i64_8i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    packuswb %xmm0, %xmm0
; SSSE3-NEXT:    movq %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i64_8i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    packuswb %xmm0, %xmm0
; SSE41-NEXT:    movq %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i8:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rax)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc8i64_8i8:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rax)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovqb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

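; Truncate <8 x i32> to <8 x i16>: SSE2 uses shift pairs + packssdw,
; SSSE3/SSE4.1/AVX1 use pshufb, and the AVX512VL variants have a direct
; vpmovdw.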
define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %0
}

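; ashr by 16 + truncation keeps the high half of each dword; psrad/vpsrad
; followed by a signed pack (packssdw) handles it everywhere before AVX512.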
define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
; SSE-LABEL: trunc8i32_8i16_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16_ashr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16_ashr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16_ashr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

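; The lshr variant: after psrld $16 every value fits in 16 unsigned bits, so
; SSE4.1 and later can use the unsigned pack (packusdw).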
define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i16_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i16_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i16_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i16_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i16_lshr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i16_lshr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i16_lshr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

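; Truncate <8 x i32> to <8 x i8> and store: the AVX512VL variants store
; directly with vpmovdb; everything else masks or pshufbs down to bytes first.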
define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i32_8i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i32_8i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    movq %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc8i32_8i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovq %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc8i32_8i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vmovq %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc8i32_8i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

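; Truncate <16 x i32> to <16 x i16> and store: AVX512 is a single vpmovdw to
; memory; narrower targets pack the 128-bit halves separately.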
define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pslld $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    pslld $16, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    pslld $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    pslld $16, %xmm2
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i16>
  store <16 x i16> %0, <16 x i16>* undef, align 4
  ret void
}

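; ashr by 16 + truncation to i16, packed with packssdw. AVX512 may use a
; logical shift (vpsrld) because the truncation discards the bits where ashr
; and lshr differ.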
define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
; SSE-LABEL: trunc16i32_16i16_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrad $16, %xmm3
; SSE-NEXT:    psrad $16, %xmm2
; SSE-NEXT:    packssdw %xmm3, %xmm2
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm2, (%rax)
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrad $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrad $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrad $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

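; The lshr variant lowers the same way: shift each half, pack, store.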
define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $16, %xmm2
; SSE2-NEXT:    psrld $16, %xmm3
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    pslld $16, %xmm0
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    pslld $16, %xmm3
; SSE2-NEXT:    psrad $16, %xmm3
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i16_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrld $16, %xmm2
; SSSE3-NEXT:    psrld $16, %xmm3
; SSSE3-NEXT:    psrld $16, %xmm0
; SSSE3-NEXT:    psrld $16, %xmm1
; SSSE3-NEXT:    pslld $16, %xmm1
; SSSE3-NEXT:    psrad $16, %xmm1
; SSSE3-NEXT:    pslld $16, %xmm0
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    packssdw %xmm1, %xmm0
; SSSE3-NEXT:    pslld $16, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    pslld $16, %xmm2
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i16_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $16, %xmm3
; SSE41-NEXT:    psrld $16, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    psrld $16, %xmm1
; SSE41-NEXT:    psrld $16, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i16_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i16_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

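; Truncate <16 x i32> to <16 x i8> and store: AVX512's vpmovdb does the whole
; job; SSE/AVX mask and pack in stages.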
define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

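; ashr by 24 + truncation to i8: the shifted values are in signed 8-bit range,
; so the signed packs (packssdw/packsswb) preserve them exactly. AVX512 again
; uses a logical shift since only the low 8 result bits survive the
; truncation.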
define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
; SSE-LABEL: trunc16i32_16i8_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrad $24, %xmm1
; SSE-NEXT:    psrad $24, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    psrad $24, %xmm3
; SSE-NEXT:    psrad $24, %xmm2
; SSE-NEXT:    packssdw %xmm3, %xmm2
; SSE-NEXT:    packsswb %xmm2, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrad $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrad $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad $24, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrad $24, %ymm1, %ymm1
; AVX2-NEXT:    vpsrad $24, %ymm0, %ymm0
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8_ashr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

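; The lshr variant can use the unsigned packs (packusdw/packuswb) because the
; shifted values are at most 8 bits wide.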
define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_lshr:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    psrld $24, %xmm1
; SSE2-NEXT:    psrld $24, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    psrld $24, %xmm3
; SSE2-NEXT:    psrld $24, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i32_16i8_lshr:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    psrld $24, %xmm1
; SSSE3-NEXT:    psrld $24, %xmm0
; SSSE3-NEXT:    packuswb %xmm1, %xmm0
; SSSE3-NEXT:    psrld $24, %xmm3
; SSSE3-NEXT:    psrld $24, %xmm2
; SSSE3-NEXT:    packuswb %xmm3, %xmm2
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i32_16i8_lshr:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    psrld $24, %xmm1
; SSE41-NEXT:    psrld $24, %xmm0
; SSE41-NEXT:    packusdw %xmm1, %xmm0
; SSE41-NEXT:    psrld $24, %xmm3
; SSE41-NEXT:    psrld $24, %xmm2
; SSE41-NEXT:    packusdw %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc16i32_16i8_lshr:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

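; Truncate <16 x i16> to <16 x i8> and store: AVX512BWVL stores directly with
; vpmovwb; without BW the code sign-extends to <16 x i32> to reach vpmovdb.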
;PR25684
define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE2-LABEL: trunc16i16_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc16i16_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc16i16_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

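; ashr by 8 + truncation: psraw + packsswb. AVX512BWVL uses vpsrlw instead of
; vpsraw, which is legal because the truncation keeps only the original high
; byte, and both shifts move that byte into place identically.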
define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_ashr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    packsswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8_ashr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8_ashr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8_ashr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8_ashr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8_ashr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

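; The lshr variant: psrlw $8 then packuswb, since the shifted values are
; already in unsigned byte range.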
define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_lshr:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc16i16_16i8_lshr:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc16i16_16i8_lshr:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc16i16_16i8_lshr:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc16i16_16i8_lshr:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc16i16_16i8_lshr:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

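; Plain <32 x i16> -> <32 x i8> truncation with a store. AVX512BW targets can
; emit a single zmm vpmovwb; narrower targets mask/shuffle and pack per 128 bits.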
define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE2-LABEL: trunc32i16_32i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    movdqu %xmm2, (%rax)
; SSE2-NEXT:    movdqu %xmm0, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc32i16_32i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pshufb %xmm4, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSSE3-NEXT:    movdqu %xmm2, (%rax)
; SSSE3-NEXT:    movdqu %xmm0, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc32i16_32i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm4, %xmm1
; SSE41-NEXT:    pshufb %xmm4, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    pshufb %xmm4, %xmm3
; SSE41-NEXT:    pshufb %xmm4, %xmm2
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE41-NEXT:    movdqu %xmm2, (%rax)
; SSE41-NEXT:    movdqu %xmm0, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc32i16_32i8:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc32i16_32i8:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc32i16_32i8:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqu %ymm0, (%rax)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rax)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rax)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
  store <32 x i8> %0, <32 x i8>* undef, align 4
  ret void
}

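; Truncate two <4 x i64> inputs to <4 x i32> each and concatenate the results
; into one <8 x i32>.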
define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: trunc2x4i64_8i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc2x4i64_8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i32:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i64_8i32:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i32:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i32>
  %1 = trunc <4 x i64> %b to <4 x i32>
  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %2
}

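; Truncate two <4 x i64> inputs to <4 x i16> each and concatenate the results
; into one <8 x i16>.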
define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: trunc2x4i64_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x4i64_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x4i64_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
; SSE41-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
; AVX2-SLOW:       # %bb.0: # %entry
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc2x4i64_8i16:
; AVX2-FAST:       # %bb.0: # %entry
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i16>
  %1 = trunc <4 x i64> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

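; Truncate two <2 x i64> inputs to <2 x i32> each and concatenate; a single
; shufps taking the even dwords of both sources suffices on every target.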
define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: trunc2x2i64_4i32:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc2x2i64_4i32:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc2x2i64_4i32:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512-NEXT:    retq
entry:
  %0 = trunc <2 x i64> %a to <2 x i32>
  %1 = trunc <2 x i64> %b to <2 x i32>
  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

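; Truncate <2 x i64> to <2 x i32> and bitcast the pair to i64. Note that the
; AVX512VL targets currently go through a stack slot via a truncating vpmovqd
; store instead of shuffling in-register.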
define i64 @trunc2i64_i64(<2 x i64> %inval) {
; SSE-LABEL: trunc2i64_i64:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    movq %xmm0, %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc2i64_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc2i64_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc2i64_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc2i64_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc2i64_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <2 x i64> %inval to <2 x i32>
  %1 = bitcast <2 x i32> %0 to i64
  ret i64 %1
}

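; Truncate two <4 x i32> inputs to <4 x i16> each and concatenate the results
; into one <8 x i16>.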
define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: trunc2x4i32_8i16:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x4i32_8i16:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x4i32_8i16:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc2x4i32_8i16:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc2x4i32_8i16:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    retq
entry:
  %0 = trunc <4 x i32> %a to <4 x i16>
  %1 = trunc <4 x i32> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
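; Truncate <4 x i32> to <4 x i16> and bitcast the result to i64.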
define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-LABEL: trunc4i32_i64:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc4i32_i64:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc4i32_i64:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc4i32_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc4i32_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc4i32_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc4i32_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc4i32_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <4 x i32> %inval to <4 x i16>
  %1 = bitcast <4 x i16> %0 to i64
  ret i64 %1
}

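; Truncate two <8 x i16> inputs to <8 x i8> each and concatenate the results
; into one <16 x i8>.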
define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: trunc2x8i16_16i8:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc2x8i16_16i8:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT:    pshufb %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc2x8i16_16i8:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pshufb %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc2x8i16_16i8:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc2x8i16_16i8:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    retq
entry:
  %0 = trunc <8 x i16> %a to <8 x i8>
  %1 = trunc <8 x i16> %b to <8 x i8>
  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
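; Truncate <8 x i16> to <8 x i8> and bitcast the result to i64.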
define i64 @trunc8i16_i64(<8 x i16> %inval) {
; SSE2-LABEL: trunc8i16_i64:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc8i16_i64:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc8i16_i64:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc8i16_i64:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc8i16_i64:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc8i16_i64:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, %rax
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc8i16_i64:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, %rax
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc8i16_i64:
; AVX512BWVL:       # %bb.0: # %entry
; AVX512BWVL-NEXT:    vpmovwb %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT:    retq
entry:
  %0 = trunc <8 x i16> %inval to <8 x i8>
  %1 = bitcast <8 x i8> %0 to i64
  ret i64 %1
}

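; Truncating zeroinitializer and shuffling it (undef lanes included) should
; constant-fold to an all-zeros vector on every target.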
define <16 x i8> @trunc16i64_16i8_const() {
; SSE-LABEL: trunc16i64_16i8_const:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc16i64_16i8_const:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: trunc16i64_16i8_const:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    retq
entry:
  %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
  %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
  ret <16 x i8> %1
}

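; PR32160: truncate <8 x i32> to <8 x i16>, then splat element 2 of the result.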
define <8 x i16> @PR32160(<8 x i32> %x) {
; SSE-LABEL: PR32160:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR32160:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: PR32160:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: PR32160:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: PR32160:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR32160:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR32160:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR32160:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %shuf = trunc <8 x i32> %x to <8 x i16>
  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i16> %trunc
}

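; PR34773: load 32 i16 elements, shift each right by 8, truncate to i8 and
; store the high bytes; AVX512BWVL folds the loads into vpsrlw and stores
; directly with truncating vpmovwb.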
define void @PR34773(i16* %a0, i8* %a1) {
; SSE-LABEL: PR34773:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqu 32(%rdi), %xmm2
; SSE-NEXT:    movdqu 48(%rdi), %xmm3
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    psrlw $8, %xmm3
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    movdqu %xmm0, (%rsi)
; SSE-NEXT:    movdqu %xmm2, 16(%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR34773:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
; AVX1-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX1-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR34773:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX2-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: PR34773:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34773:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34773:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT:    vmovdqu 32(%rdi), %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX512BW-NEXT:    vmovdqu %xmm1, 16(%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34773:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpsrlw $8, 32(%rdi), %ymm1
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vpmovwb %ymm1, 16(%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1  = getelementptr i16, i16* %a0, i64 16
  %2  = getelementptr i8, i8* %a1, i64 16
  %3  = bitcast i16* %a0 to <16 x i16>*
  %4  = bitcast i16* %1 to <16 x i16>*
  %5  = bitcast i8* %a1 to <16 x i8>*
  %6  = bitcast i8* %2 to <16 x i8>*
  %7  = load <16 x i16>, <16 x i16>* %3, align 2
  %8  = load <16 x i16>, <16 x i16>* %4, align 2
  %9  = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %11 = trunc <16 x i16> %9  to <16 x i8>
  %12 = trunc <16 x i16> %10 to <16 x i8>
  store <16 x i8> %11, <16 x i8>* %5, align 1
  store <16 x i8> %12, <16 x i8>* %6, align 1
  ret void
}