; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
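
; Check that a vector binary operation followed by a truncation of its result
; is lowered to the expected shuffle/pack/truncate sequence on each subtarget
; covered by the RUN lines above.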

;
; add
;

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm6, %xmm2
; SSE-NEXT:    paddq %xmm7, %xmm3
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    paddq %xmm5, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
; AVX2-SLOW-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
; AVX2-FAST-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm4, %xmm0
; SSE-NEXT:    paddd %xmm5, %xmm1
; SSE-NEXT:    paddd %xmm6, %xmm2
; SSE-NEXT:    paddd %xmm7, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = add <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $16, %xmm2
; SSE-NEXT:    psrad $16, %xmm2
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    packssdw %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = sext <8 x i8> %1 to <8 x i32>
  %3 = add <8 x i32> %2, %a1
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}

;
; add to constant
;

define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    paddw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    paddw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-SLOW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-FAST-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    paddb {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub
;

define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq %xmm3, %xmm1
; SSE-NEXT:    psubq %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq %xmm6, %xmm2
; SSE-NEXT:    psubq %xmm7, %xmm3
; SSE-NEXT:    psubq %xmm4, %xmm0
; SSE-NEXT:    psubq %xmm5, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    psubd %xmm3, %xmm1
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
; AVX2-SLOW-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
; AVX2-FAST-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
; AVX2-FAST-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

   1196 define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
   1197 ; SSE-LABEL: trunc_sub_v16i32_v16i8:
   1198 ; SSE:       # %bb.0:
   1199 ; SSE-NEXT:    psubd %xmm4, %xmm0
   1200 ; SSE-NEXT:    psubd %xmm5, %xmm1
   1201 ; SSE-NEXT:    psubd %xmm6, %xmm2
   1202 ; SSE-NEXT:    psubd %xmm7, %xmm3
   1203 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   1204 ; SSE-NEXT:    pand %xmm4, %xmm3
   1205 ; SSE-NEXT:    pand %xmm4, %xmm2
   1206 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   1207 ; SSE-NEXT:    pand %xmm4, %xmm1
   1208 ; SSE-NEXT:    pand %xmm4, %xmm0
   1209 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   1210 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   1211 ; SSE-NEXT:    retq
   1212 ;
   1213 ; AVX1-LABEL: trunc_sub_v16i32_v16i8:
   1214 ; AVX1:       # %bb.0:
   1215 ; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
   1216 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
   1217 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1218 ; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
   1219 ; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
   1220 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
   1221 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1222 ; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
   1223 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   1224 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   1225 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   1226 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
   1227 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   1228 ; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
   1229 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
   1230 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   1231 ; AVX1-NEXT:    vzeroupper
   1232 ; AVX1-NEXT:    retq
   1233 ;
   1234 ; AVX2-LABEL: trunc_sub_v16i32_v16i8:
   1235 ; AVX2:       # %bb.0:
   1236 ; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
   1237 ; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
   1238 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1239 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   1240 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   1241 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1242 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   1243 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   1244 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1245 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   1246 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1247 ; AVX2-NEXT:    vzeroupper
   1248 ; AVX2-NEXT:    retq
   1249 ;
   1250 ; AVX512-LABEL: trunc_sub_v16i32_v16i8:
   1251 ; AVX512:       # %bb.0:
   1252 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
   1253 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   1254 ; AVX512-NEXT:    vzeroupper
   1255 ; AVX512-NEXT:    retq
   1256   %1 = sub <16 x i32> %a0, %a1
   1257   %2 = trunc <16 x i32> %1 to <16 x i8>
   1258   ret <16 x i8> %2
   1259 }
   1260 
   1261 define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
   1262 ; SSE-LABEL: trunc_sub_v16i16_v16i8:
   1263 ; SSE:       # %bb.0:
   1264 ; SSE-NEXT:    psubw %xmm2, %xmm0
   1265 ; SSE-NEXT:    psubw %xmm3, %xmm1
   1266 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   1267 ; SSE-NEXT:    pand %xmm2, %xmm1
   1268 ; SSE-NEXT:    pand %xmm2, %xmm0
   1269 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   1270 ; SSE-NEXT:    retq
   1271 ;
   1272 ; AVX1-LABEL: trunc_sub_v16i16_v16i8:
   1273 ; AVX1:       # %bb.0:
   1274 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
   1275 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1276 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1277 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
   1278 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1279 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
   1280 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
   1281 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1282 ; AVX1-NEXT:    vzeroupper
   1283 ; AVX1-NEXT:    retq
   1284 ;
   1285 ; AVX2-LABEL: trunc_sub_v16i16_v16i8:
   1286 ; AVX2:       # %bb.0:
   1287 ; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
   1288 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1289 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1290 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1291 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1292 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1293 ; AVX2-NEXT:    vzeroupper
   1294 ; AVX2-NEXT:    retq
   1295 ;
   1296 ; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
   1297 ; AVX512F:       # %bb.0:
   1298 ; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
   1299 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   1300 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   1301 ; AVX512F-NEXT:    vzeroupper
   1302 ; AVX512F-NEXT:    retq
   1303 ;
   1304 ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
   1305 ; AVX512BW:       # %bb.0:
   1306 ; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
   1307 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1308 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1309 ; AVX512BW-NEXT:    vzeroupper
   1310 ; AVX512BW-NEXT:    retq
   1311 ;
   1312 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
   1313 ; AVX512DQ:       # %bb.0:
   1314 ; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
   1315 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   1316 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   1317 ; AVX512DQ-NEXT:    vzeroupper
   1318 ; AVX512DQ-NEXT:    retq
   1319   %1 = sub <16 x i16> %a0, %a1
   1320   %2 = trunc <16 x i16> %1 to <16 x i8>
   1321   ret <16 x i8> %2
   1322 }
   1323 
   1324 ;
   1325 ; sub to constant
   1326 ;
   1327 
   1328 define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
   1329 ; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
   1330 ; SSE:       # %bb.0:
   1331 ; SSE-NEXT:    movl $1, %eax
   1332 ; SSE-NEXT:    movq %rax, %xmm2
   1333 ; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
   1334 ; SSE-NEXT:    psubq %xmm2, %xmm0
   1335 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
   1336 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   1337 ; SSE-NEXT:    retq
   1338 ;
   1339 ; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
   1340 ; AVX1:       # %bb.0:
   1341 ; AVX1-NEXT:    movl $1, %eax
   1342 ; AVX1-NEXT:    vmovq %rax, %xmm1
   1343 ; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
   1344 ; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
   1345 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1346 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
   1347 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
   1348 ; AVX1-NEXT:    vzeroupper
   1349 ; AVX1-NEXT:    retq
   1350 ;
   1351 ; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
   1352 ; AVX2-SLOW:       # %bb.0:
   1353 ; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
   1354 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   1355 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1356 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1357 ; AVX2-SLOW-NEXT:    vzeroupper
   1358 ; AVX2-SLOW-NEXT:    retq
   1359 ;
   1360 ; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
   1361 ; AVX2-FAST:       # %bb.0:
   1362 ; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
   1363 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
   1364 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
   1365 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1366 ; AVX2-FAST-NEXT:    vzeroupper
   1367 ; AVX2-FAST-NEXT:    retq
   1368 ;
   1369 ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
   1370 ; AVX512:       # %bb.0:
   1371 ; AVX512-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
   1372 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   1373 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1374 ; AVX512-NEXT:    vzeroupper
   1375 ; AVX512-NEXT:    retq
   1376   %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   1377   %2 = trunc <4 x i64> %1 to <4 x i32>
   1378   ret <4 x i32> %2
   1379 }
   1380 
   1381 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
   1382 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
   1383 ; SSE:       # %bb.0:
   1384 ; SSE-NEXT:    movl $1, %eax
   1385 ; SSE-NEXT:    movq %rax, %xmm4
   1386 ; SSE-NEXT:    pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
   1387 ; SSE-NEXT:    psubq %xmm4, %xmm0
   1388 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
   1389 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm2
   1390 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm3
   1391 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
   1392 ; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
   1393 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
   1394 ; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
   1395 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   1396 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1397 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
   1398 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1399 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   1400 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1401 ; SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
   1402 ; SSE-NEXT:    movapd %xmm2, %xmm0
   1403 ; SSE-NEXT:    retq
   1404 ;
   1405 ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
   1406 ; AVX1:       # %bb.0:
   1407 ; AVX1-NEXT:    movl $1, %eax
   1408 ; AVX1-NEXT:    vmovq %rax, %xmm2
   1409 ; AVX1-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
   1410 ; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
   1411 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1412 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
   1413 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm3
   1414 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1415 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
   1416 ; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
   1417 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
   1418 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
   1419 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
   1420 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
   1421 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
   1422 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
   1423 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   1424 ; AVX1-NEXT:    vzeroupper
   1425 ; AVX1-NEXT:    retq
   1426 ;
   1427 ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
   1428 ; AVX2-SLOW:       # %bb.0:
   1429 ; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
   1430 ; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
   1431 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   1432 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1433 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   1434 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   1435 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1436 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1437 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1438 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1439 ; AVX2-SLOW-NEXT:    vzeroupper
   1440 ; AVX2-SLOW-NEXT:    retq
   1441 ;
   1442 ; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
   1443 ; AVX2-FAST:       # %bb.0:
   1444 ; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
   1445 ; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
   1446 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   1447 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   1448 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   1449 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1450 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1451 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1452 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1453 ; AVX2-FAST-NEXT:    vzeroupper
   1454 ; AVX2-FAST-NEXT:    retq
   1455 ;
   1456 ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
   1457 ; AVX512:       # %bb.0:
   1458 ; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
   1459 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
   1460 ; AVX512-NEXT:    vzeroupper
   1461 ; AVX512-NEXT:    retq
   1462   %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   1463   %2 = trunc <8 x i64> %1 to <8 x i16>
   1464   ret <8 x i16> %2
   1465 }
   1466 
   1467 define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
   1468 ; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
   1469 ; SSE:       # %bb.0:
   1470 ; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
   1471 ; SSE-NEXT:    psubd {{.*}}(%rip), %xmm1
   1472 ; SSE-NEXT:    pslld $16, %xmm1
   1473 ; SSE-NEXT:    psrad $16, %xmm1
   1474 ; SSE-NEXT:    pslld $16, %xmm0
   1475 ; SSE-NEXT:    psrad $16, %xmm0
   1476 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   1477 ; SSE-NEXT:    retq
   1478 ;
   1479 ; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
   1480 ; AVX1:       # %bb.0:
   1481 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm1
   1482 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1483 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
   1484 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   1485 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1486 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1487 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1488 ; AVX1-NEXT:    vzeroupper
   1489 ; AVX1-NEXT:    retq
   1490 ;
   1491 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
   1492 ; AVX2:       # %bb.0:
   1493 ; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
   1494 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1495 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1496 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1497 ; AVX2-NEXT:    vzeroupper
   1498 ; AVX2-NEXT:    retq
   1499 ;
   1500 ; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
   1501 ; AVX512:       # %bb.0:
   1502 ; AVX512-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
   1503 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   1504 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1505 ; AVX512-NEXT:    vzeroupper
   1506 ; AVX512-NEXT:    retq
   1507   %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   1508   %2 = trunc <8 x i32> %1 to <8 x i16>
   1509   ret <8 x i16> %2
   1510 }
   1511 
   1512 define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
   1513 ; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
   1514 ; SSE:       # %bb.0:
   1515 ; SSE-NEXT:    movl $1, %eax
   1516 ; SSE-NEXT:    movq %rax, %xmm8
   1517 ; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
   1518 ; SSE-NEXT:    psubq %xmm8, %xmm0
   1519 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm1
   1520 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm2
   1521 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm3
   1522 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm4
   1523 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm5
   1524 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm6
   1525 ; SSE-NEXT:    psubq {{.*}}(%rip), %xmm7
   1526 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   1527 ; SSE-NEXT:    pand %xmm8, %xmm7
   1528 ; SSE-NEXT:    pand %xmm8, %xmm6
   1529 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   1530 ; SSE-NEXT:    pand %xmm8, %xmm5
   1531 ; SSE-NEXT:    pand %xmm8, %xmm4
   1532 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   1533 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   1534 ; SSE-NEXT:    pand %xmm8, %xmm3
   1535 ; SSE-NEXT:    pand %xmm8, %xmm2
   1536 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   1537 ; SSE-NEXT:    pand %xmm8, %xmm1
   1538 ; SSE-NEXT:    pand %xmm8, %xmm0
   1539 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   1540 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   1541 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   1542 ; SSE-NEXT:    retq
   1543 ;
   1544 ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
   1545 ; AVX1:       # %bb.0:
   1546 ; AVX1-NEXT:    movl $1, %eax
   1547 ; AVX1-NEXT:    vmovq %rax, %xmm4
   1548 ; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
   1549 ; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
   1550 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1551 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
   1552 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm5
   1553 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1554 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
   1555 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm6
   1556 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
   1557 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm2
   1558 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm7
   1559 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
   1560 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm3
   1561 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   1562 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
   1563 ; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
   1564 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm7, %xmm3
   1565 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   1566 ; AVX1-NEXT:    vpand %xmm4, %xmm6, %xmm6
   1567 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm6, %xmm2
   1568 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   1569 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   1570 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
   1571 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
   1572 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   1573 ; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
   1574 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
   1575 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   1576 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   1577 ; AVX1-NEXT:    vzeroupper
   1578 ; AVX1-NEXT:    retq
   1579 ;
   1580 ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
   1581 ; AVX2-SLOW:       # %bb.0:
   1582 ; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
   1583 ; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
   1584 ; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm3, %ymm3
   1585 ; AVX2-SLOW-NEXT:    vpsubq {{.*}}(%rip), %ymm2, %ymm2
   1586 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   1587 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   1588 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   1589 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   1590 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   1591 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1592 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   1593 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   1594 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1595 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   1596 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   1597 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1598 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   1599 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   1600 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1601 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   1602 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1603 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   1604 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   1605 ; AVX2-SLOW-NEXT:    vzeroupper
   1606 ; AVX2-SLOW-NEXT:    retq
   1607 ;
   1608 ; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8:
   1609 ; AVX2-FAST:       # %bb.0:
   1610 ; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm1, %ymm1
   1611 ; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
   1612 ; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm3, %ymm3
   1613 ; AVX2-FAST-NEXT:    vpsubq {{.*}}(%rip), %ymm2, %ymm2
   1614 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   1615 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   1616 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   1617 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   1618 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1619 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   1620 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   1621 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1622 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   1623 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   1624 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   1625 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1626 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   1627 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1628 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   1629 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   1630 ; AVX2-FAST-NEXT:    vzeroupper
   1631 ; AVX2-FAST-NEXT:    retq
   1632 ;
   1633 ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
   1634 ; AVX512:       # %bb.0:
   1635 ; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
   1636 ; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
   1637 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   1638 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
   1639 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   1640 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   1641 ; AVX512-NEXT:    vzeroupper
   1642 ; AVX512-NEXT:    retq
   1643   %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   1644   %2 = trunc <16 x i64> %1 to <16 x i8>
   1645   ret <16 x i8> %2
   1646 }
   1647 
   1648 define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
   1649 ; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
   1650 ; SSE:       # %bb.0:
   1651 ; SSE-NEXT:    psubd {{.*}}(%rip), %xmm0
   1652 ; SSE-NEXT:    psubd {{.*}}(%rip), %xmm1
   1653 ; SSE-NEXT:    psubd {{.*}}(%rip), %xmm2
   1654 ; SSE-NEXT:    psubd {{.*}}(%rip), %xmm3
   1655 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   1656 ; SSE-NEXT:    pand %xmm4, %xmm3
   1657 ; SSE-NEXT:    pand %xmm4, %xmm2
   1658 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   1659 ; SSE-NEXT:    pand %xmm4, %xmm1
   1660 ; SSE-NEXT:    pand %xmm4, %xmm0
   1661 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   1662 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   1663 ; SSE-NEXT:    retq
   1664 ;
   1665 ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
   1666 ; AVX1:       # %bb.0:
   1667 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm2
   1668 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1669 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
   1670 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm3
   1671 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1672 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm1
   1673 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   1674 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   1675 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
   1676 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
   1677 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   1678 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   1679 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
   1680 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   1681 ; AVX1-NEXT:    vzeroupper
   1682 ; AVX1-NEXT:    retq
   1683 ;
   1684 ; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
   1685 ; AVX2:       # %bb.0:
   1686 ; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
   1687 ; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm1, %ymm1
   1688 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1689 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   1690 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   1691 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1692 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   1693 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   1694 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1695 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   1696 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1697 ; AVX2-NEXT:    vzeroupper
   1698 ; AVX2-NEXT:    retq
   1699 ;
   1700 ; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
   1701 ; AVX512:       # %bb.0:
   1702 ; AVX512-NEXT:    vpsubd {{.*}}(%rip), %zmm0, %zmm0
   1703 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   1704 ; AVX512-NEXT:    vzeroupper
   1705 ; AVX512-NEXT:    retq
   1706   %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   1707   %2 = trunc <16 x i32> %1 to <16 x i8>
   1708   ret <16 x i8> %2
   1709 }
   1710 
   1711 define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
   1712 ; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
   1713 ; SSE:       # %bb.0:
   1714 ; SSE-NEXT:    psubw {{.*}}(%rip), %xmm0
   1715 ; SSE-NEXT:    psubw {{.*}}(%rip), %xmm1
   1716 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   1717 ; SSE-NEXT:    pand %xmm2, %xmm1
   1718 ; SSE-NEXT:    pand %xmm2, %xmm0
   1719 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   1720 ; SSE-NEXT:    retq
   1721 ;
   1722 ; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
   1723 ; AVX1:       # %bb.0:
   1724 ; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm1
   1725 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1726 ; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
   1727 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1728 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1729 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1730 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   1731 ; AVX1-NEXT:    vzeroupper
   1732 ; AVX1-NEXT:    retq
   1733 ;
   1734 ; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
   1735 ; AVX2:       # %bb.0:
   1736 ; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
   1737 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1738 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1739 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1740 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1741 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1742 ; AVX2-NEXT:    vzeroupper
   1743 ; AVX2-NEXT:    retq
   1744 ;
   1745 ; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
   1746 ; AVX512F:       # %bb.0:
   1747 ; AVX512F-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
   1748 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   1749 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   1750 ; AVX512F-NEXT:    vzeroupper
   1751 ; AVX512F-NEXT:    retq
   1752 ;
   1753 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
   1754 ; AVX512BW:       # %bb.0:
   1755 ; AVX512BW-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
   1756 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1757 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1758 ; AVX512BW-NEXT:    vzeroupper
   1759 ; AVX512BW-NEXT:    retq
   1760 ;
   1761 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
   1762 ; AVX512DQ:       # %bb.0:
   1763 ; AVX512DQ-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
   1764 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   1765 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   1766 ; AVX512DQ-NEXT:    vzeroupper
   1767 ; AVX512DQ-NEXT:    retq
   1768   %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   1769   %2 = trunc <16 x i16> %1 to <16 x i8>
   1770   ret <16 x i8> %2
   1771 }
   1772 
   1773 ;
   1774 ; mul
   1775 ;
   1776 
   1777 define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
   1778 ; SSE-LABEL: trunc_mul_v4i64_v4i32:
   1779 ; SSE:       # %bb.0:
   1780 ; SSE-NEXT:    movdqa %xmm1, %xmm4
   1781 ; SSE-NEXT:    psrlq $32, %xmm4
   1782 ; SSE-NEXT:    pmuludq %xmm3, %xmm4
   1783 ; SSE-NEXT:    movdqa %xmm3, %xmm5
   1784 ; SSE-NEXT:    psrlq $32, %xmm5
   1785 ; SSE-NEXT:    pmuludq %xmm1, %xmm5
   1786 ; SSE-NEXT:    paddq %xmm4, %xmm5
   1787 ; SSE-NEXT:    psllq $32, %xmm5
   1788 ; SSE-NEXT:    pmuludq %xmm3, %xmm1
   1789 ; SSE-NEXT:    paddq %xmm5, %xmm1
   1790 ; SSE-NEXT:    movdqa %xmm0, %xmm3
   1791 ; SSE-NEXT:    psrlq $32, %xmm3
   1792 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
   1793 ; SSE-NEXT:    movdqa %xmm2, %xmm4
   1794 ; SSE-NEXT:    psrlq $32, %xmm4
   1795 ; SSE-NEXT:    pmuludq %xmm0, %xmm4
   1796 ; SSE-NEXT:    paddq %xmm3, %xmm4
   1797 ; SSE-NEXT:    psllq $32, %xmm4
   1798 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
   1799 ; SSE-NEXT:    paddq %xmm4, %xmm0
   1800 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   1801 ; SSE-NEXT:    retq
   1802 ;
   1803 ; AVX1-LABEL: trunc_mul_v4i64_v4i32:
   1804 ; AVX1:       # %bb.0:
   1805 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1806 ; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
   1807 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1808 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
   1809 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   1810 ; AVX1-NEXT:    vzeroupper
   1811 ; AVX1-NEXT:    retq
   1812 ;
   1813 ; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
   1814 ; AVX2-SLOW:       # %bb.0:
   1815 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   1816 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   1817 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   1818 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1819 ; AVX2-SLOW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   1820 ; AVX2-SLOW-NEXT:    vzeroupper
   1821 ; AVX2-SLOW-NEXT:    retq
   1822 ;
   1823 ; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32:
   1824 ; AVX2-FAST:       # %bb.0:
   1825 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   1826 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   1827 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   1828 ; AVX2-FAST-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   1829 ; AVX2-FAST-NEXT:    vzeroupper
   1830 ; AVX2-FAST-NEXT:    retq
   1831 ;
   1832 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
   1833 ; AVX512F:       # %bb.0:
   1834 ; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
   1835 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   1836 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
   1837 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
   1838 ; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   1839 ; AVX512F-NEXT:    vzeroupper
   1840 ; AVX512F-NEXT:    retq
   1841 ;
   1842 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
   1843 ; AVX512BW:       # %bb.0:
   1844 ; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
   1845 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   1846 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
   1847 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
   1848 ; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   1849 ; AVX512BW-NEXT:    vzeroupper
   1850 ; AVX512BW-NEXT:    retq
   1851 ;
   1852 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
   1853 ; AVX512DQ:       # %bb.0:
   1854 ; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
   1855 ; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   1856 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
   1857 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
   1858 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   1859 ; AVX512DQ-NEXT:    vzeroupper
   1860 ; AVX512DQ-NEXT:    retq
   1861   %1 = mul <4 x i64> %a0, %a1
   1862   %2 = trunc <4 x i64> %1 to <4 x i32>
   1863   ret <4 x i32> %2
   1864 }
   1865 
   1866 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
   1867 ; SSE-LABEL: trunc_mul_v8i64_v8i16:
   1868 ; SSE:       # %bb.0:
   1869 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
   1870 ; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
   1871 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
   1872 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
   1873 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
   1874 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
   1875 ; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
   1876 ; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
   1877 ; SSE-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
   1878 ; SSE-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
   1879 ; SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
   1880 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1881 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   1882 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1883 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
   1884 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   1885 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
   1886 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   1887 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
   1888 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   1889 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   1890 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
   1891 ; SSE-NEXT:    pmullw %xmm6, %xmm0
   1892 ; SSE-NEXT:    retq
   1893 ;
   1894 ; AVX1-LABEL: trunc_mul_v8i64_v8i16:
   1895 ; AVX1:       # %bb.0:
   1896 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
   1897 ; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
   1898 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
   1899 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
   1900 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
   1901 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   1902 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
   1903 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7]
   1904 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
   1905 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   1906 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1907 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
   1908 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
   1909 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   1910 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1911 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
   1912 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
   1913 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   1914 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   1915 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   1916 ; AVX1-NEXT:    vzeroupper
   1917 ; AVX1-NEXT:    retq
   1918 ;
   1919 ; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
   1920 ; AVX2-SLOW:       # %bb.0:
   1921 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   1922 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   1923 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   1924 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   1925 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   1926 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1927 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   1928 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   1929 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   1930 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1931 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   1932 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   1933 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1934 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   1935 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1936 ; AVX2-SLOW-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   1937 ; AVX2-SLOW-NEXT:    vzeroupper
   1938 ; AVX2-SLOW-NEXT:    retq
   1939 ;
   1940 ; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16:
   1941 ; AVX2-FAST:       # %bb.0:
   1942 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   1943 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   1944 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   1945 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   1946 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   1947 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   1948 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   1949 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   1950 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   1951 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1952 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   1953 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   1954 ; AVX2-FAST-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   1955 ; AVX2-FAST-NEXT:    vzeroupper
   1956 ; AVX2-FAST-NEXT:    retq
   1957 ;
   1958 ; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
   1959 ; AVX512F:       # %bb.0:
   1960 ; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
   1961 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
   1962 ; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   1963 ; AVX512F-NEXT:    vzeroupper
   1964 ; AVX512F-NEXT:    retq
   1965 ;
   1966 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
   1967 ; AVX512BW:       # %bb.0:
   1968 ; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
   1969 ; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
   1970 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   1971 ; AVX512BW-NEXT:    vzeroupper
   1972 ; AVX512BW-NEXT:    retq
   1973 ;
   1974 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
   1975 ; AVX512DQ:       # %bb.0:
   1976 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
   1977 ; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
   1978 ; AVX512DQ-NEXT:    vzeroupper
   1979 ; AVX512DQ-NEXT:    retq
   1980   %1 = mul <8 x i64> %a0, %a1
   1981   %2 = trunc <8 x i64> %1 to <8 x i16>
   1982   ret <8 x i16> %2
   1983 }
   1984 
   1985 define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
   1986 ; SSE-LABEL: trunc_mul_v8i32_v8i16:
   1987 ; SSE:       # %bb.0:
   1988 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
   1989 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
   1990 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1991 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
   1992 ; SSE-NEXT:    pmuludq %xmm4, %xmm2
   1993 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
   1994 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
   1995 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
   1996 ; SSE-NEXT:    pmuludq %xmm3, %xmm1
   1997 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   1998 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
   1999 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
   2000 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
   2001 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   2002 ; SSE-NEXT:    pslld $16, %xmm1
   2003 ; SSE-NEXT:    psrad $16, %xmm1
   2004 ; SSE-NEXT:    pslld $16, %xmm0
   2005 ; SSE-NEXT:    psrad $16, %xmm0
   2006 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   2007 ; SSE-NEXT:    retq
   2008 ;
   2009 ; AVX1-LABEL: trunc_mul_v8i32_v8i16:
   2010 ; AVX1:       # %bb.0:
   2011 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm2
   2012 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   2013 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2014 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   2015 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   2016 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
   2017 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
   2018 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   2019 ; AVX1-NEXT:    vzeroupper
   2020 ; AVX1-NEXT:    retq
   2021 ;
   2022 ; AVX2-LABEL: trunc_mul_v8i32_v8i16:
   2023 ; AVX2:       # %bb.0:
   2024 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
   2025 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2026 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2027 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   2028 ; AVX2-NEXT:    vzeroupper
   2029 ; AVX2-NEXT:    retq
   2030 ;
   2031 ; AVX512-LABEL: trunc_mul_v8i32_v8i16:
   2032 ; AVX512:       # %bb.0:
   2033 ; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
   2034 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   2035 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   2036 ; AVX512-NEXT:    vzeroupper
   2037 ; AVX512-NEXT:    retq
   2038   %1 = mul <8 x i32> %a0, %a1
   2039   %2 = trunc <8 x i32> %1 to <8 x i16>
   2040   ret <8 x i16> %2
   2041 }
   2042 
   2043 define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
   2044 ; SSE-LABEL: trunc_mul_v16i64_v16i8:
   2045 ; SSE:       # %bb.0:
   2046 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   2047 ; SSE-NEXT:    movdqa %xmm0, %xmm9
   2048 ; SSE-NEXT:    psrlq $32, %xmm9
   2049 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2050 ; SSE-NEXT:    movdqa %xmm8, %xmm10
   2051 ; SSE-NEXT:    psrlq $32, %xmm10
   2052 ; SSE-NEXT:    pmuludq %xmm0, %xmm10
   2053 ; SSE-NEXT:    paddq %xmm9, %xmm10
   2054 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
   2055 ; SSE-NEXT:    psllq $32, %xmm10
   2056 ; SSE-NEXT:    pmuludq %xmm8, %xmm0
   2057 ; SSE-NEXT:    paddq %xmm10, %xmm0
   2058 ; SSE-NEXT:    movdqa %xmm1, %xmm8
   2059 ; SSE-NEXT:    psrlq $32, %xmm8
   2060 ; SSE-NEXT:    pmuludq %xmm9, %xmm8
   2061 ; SSE-NEXT:    movdqa %xmm9, %xmm10
   2062 ; SSE-NEXT:    psrlq $32, %xmm10
   2063 ; SSE-NEXT:    pmuludq %xmm1, %xmm10
   2064 ; SSE-NEXT:    paddq %xmm8, %xmm10
   2065 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   2066 ; SSE-NEXT:    psllq $32, %xmm10
   2067 ; SSE-NEXT:    pmuludq %xmm9, %xmm1
   2068 ; SSE-NEXT:    paddq %xmm10, %xmm1
   2069 ; SSE-NEXT:    movdqa %xmm2, %xmm9
   2070 ; SSE-NEXT:    psrlq $32, %xmm9
   2071 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2072 ; SSE-NEXT:    movdqa %xmm8, %xmm10
   2073 ; SSE-NEXT:    psrlq $32, %xmm10
   2074 ; SSE-NEXT:    pmuludq %xmm2, %xmm10
   2075 ; SSE-NEXT:    paddq %xmm9, %xmm10
   2076 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
   2077 ; SSE-NEXT:    psllq $32, %xmm10
   2078 ; SSE-NEXT:    pmuludq %xmm8, %xmm2
   2079 ; SSE-NEXT:    paddq %xmm10, %xmm2
   2080 ; SSE-NEXT:    movdqa %xmm3, %xmm8
   2081 ; SSE-NEXT:    psrlq $32, %xmm8
   2082 ; SSE-NEXT:    pmuludq %xmm9, %xmm8
   2083 ; SSE-NEXT:    movdqa %xmm9, %xmm10
   2084 ; SSE-NEXT:    psrlq $32, %xmm10
   2085 ; SSE-NEXT:    pmuludq %xmm3, %xmm10
   2086 ; SSE-NEXT:    paddq %xmm8, %xmm10
   2087 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   2088 ; SSE-NEXT:    psllq $32, %xmm10
   2089 ; SSE-NEXT:    pmuludq %xmm9, %xmm3
   2090 ; SSE-NEXT:    paddq %xmm10, %xmm3
   2091 ; SSE-NEXT:    movdqa %xmm4, %xmm9
   2092 ; SSE-NEXT:    psrlq $32, %xmm9
   2093 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2094 ; SSE-NEXT:    movdqa %xmm8, %xmm10
   2095 ; SSE-NEXT:    psrlq $32, %xmm10
   2096 ; SSE-NEXT:    pmuludq %xmm4, %xmm10
   2097 ; SSE-NEXT:    paddq %xmm9, %xmm10
   2098 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
   2099 ; SSE-NEXT:    psllq $32, %xmm10
   2100 ; SSE-NEXT:    pmuludq %xmm8, %xmm4
   2101 ; SSE-NEXT:    paddq %xmm10, %xmm4
   2102 ; SSE-NEXT:    movdqa %xmm5, %xmm8
   2103 ; SSE-NEXT:    psrlq $32, %xmm8
   2104 ; SSE-NEXT:    pmuludq %xmm9, %xmm8
   2105 ; SSE-NEXT:    movdqa %xmm9, %xmm10
   2106 ; SSE-NEXT:    psrlq $32, %xmm10
   2107 ; SSE-NEXT:    pmuludq %xmm5, %xmm10
   2108 ; SSE-NEXT:    paddq %xmm8, %xmm10
   2109 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
   2110 ; SSE-NEXT:    psllq $32, %xmm10
   2111 ; SSE-NEXT:    pmuludq %xmm9, %xmm5
   2112 ; SSE-NEXT:    paddq %xmm10, %xmm5
   2113 ; SSE-NEXT:    movdqa %xmm6, %xmm9
   2114 ; SSE-NEXT:    psrlq $32, %xmm9
   2115 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2116 ; SSE-NEXT:    movdqa %xmm8, %xmm10
   2117 ; SSE-NEXT:    psrlq $32, %xmm10
   2118 ; SSE-NEXT:    pmuludq %xmm6, %xmm10
   2119 ; SSE-NEXT:    paddq %xmm9, %xmm10
   2120 ; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
   2121 ; SSE-NEXT:    psllq $32, %xmm10
   2122 ; SSE-NEXT:    pmuludq %xmm8, %xmm6
   2123 ; SSE-NEXT:    paddq %xmm10, %xmm6
   2124 ; SSE-NEXT:    movdqa %xmm7, %xmm8
   2125 ; SSE-NEXT:    psrlq $32, %xmm8
   2126 ; SSE-NEXT:    pmuludq %xmm9, %xmm8
   2127 ; SSE-NEXT:    movdqa %xmm9, %xmm10
   2128 ; SSE-NEXT:    psrlq $32, %xmm10
   2129 ; SSE-NEXT:    pmuludq %xmm7, %xmm10
   2130 ; SSE-NEXT:    paddq %xmm8, %xmm10
   2131 ; SSE-NEXT:    pmuludq %xmm9, %xmm7
   2132 ; SSE-NEXT:    psllq $32, %xmm10
   2133 ; SSE-NEXT:    paddq %xmm10, %xmm7
   2134 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   2135 ; SSE-NEXT:    pand %xmm8, %xmm7
   2136 ; SSE-NEXT:    pand %xmm8, %xmm6
   2137 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   2138 ; SSE-NEXT:    pand %xmm8, %xmm5
   2139 ; SSE-NEXT:    pand %xmm8, %xmm4
   2140 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   2141 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   2142 ; SSE-NEXT:    pand %xmm8, %xmm3
   2143 ; SSE-NEXT:    pand %xmm8, %xmm2
   2144 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   2145 ; SSE-NEXT:    pand %xmm8, %xmm1
   2146 ; SSE-NEXT:    pand %xmm8, %xmm0
   2147 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   2148 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   2149 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   2150 ; SSE-NEXT:    retq
   2151 ;
   2152 ; AVX1-LABEL: trunc_mul_v16i64_v16i8:
   2153 ; AVX1:       # %bb.0:
   2154 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm8
   2155 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm8, %xmm8
   2156 ; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm9
   2157 ; AVX1-NEXT:    vpmuludq %xmm9, %xmm0, %xmm9
   2158 ; AVX1-NEXT:    vpaddq %xmm8, %xmm9, %xmm8
   2159 ; AVX1-NEXT:    vpsllq $32, %xmm8, %xmm8
   2160 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm9
   2161 ; AVX1-NEXT:    vpaddq %xmm8, %xmm9, %xmm8
   2162 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm9
   2163 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2164 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm4
   2165 ; AVX1-NEXT:    vpmuludq %xmm9, %xmm4, %xmm10
   2166 ; AVX1-NEXT:    vpsrlq $32, %xmm9, %xmm4
   2167 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
   2168 ; AVX1-NEXT:    vpaddq %xmm10, %xmm4, %xmm4
   2169 ; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
   2170 ; AVX1-NEXT:    vpmuludq %xmm9, %xmm0, %xmm0
   2171 ; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm9
   2172 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
   2173 ; AVX1-NEXT:    vpmuludq %xmm5, %xmm4, %xmm4
   2174 ; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm0
   2175 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
   2176 ; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
   2177 ; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
   2178 ; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm4
   2179 ; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm10
   2180 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm0
   2181 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   2182 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm5
   2183 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm5, %xmm5
   2184 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm4
   2185 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
   2186 ; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
   2187 ; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
   2188 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
   2189 ; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm1
   2190 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm0
   2191 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm0
   2192 ; AVX1-NEXT:    vpsrlq $32, %xmm6, %xmm4
   2193 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
   2194 ; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
   2195 ; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
   2196 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm4
   2197 ; AVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm5
   2198 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm0
   2199 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
   2200 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
   2201 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm4, %xmm4
   2202 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
   2203 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm6
   2204 ; AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
   2205 ; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
   2206 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm0
   2207 ; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
   2208 ; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm2
   2209 ; AVX1-NEXT:    vpmuludq %xmm7, %xmm2, %xmm2
   2210 ; AVX1-NEXT:    vpsrlq $32, %xmm7, %xmm4
   2211 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
   2212 ; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
   2213 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
   2214 ; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm4
   2215 ; AVX1-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
   2216 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm4
   2217 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
   2218 ; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm6
   2219 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm6
   2220 ; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm7
   2221 ; AVX1-NEXT:    vpmuludq %xmm7, %xmm3, %xmm7
   2222 ; AVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
   2223 ; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
   2224 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm3
   2225 ; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
   2226 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   2227 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
   2228 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2229 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   2230 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2231 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm3
   2232 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
   2233 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   2234 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   2235 ; AVX1-NEXT:    vpand %xmm4, %xmm10, %xmm2
   2236 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
   2237 ; AVX1-NEXT:    vpand %xmm4, %xmm9, %xmm2
   2238 ; AVX1-NEXT:    vpand %xmm4, %xmm8, %xmm3
   2239 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
   2240 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
   2241 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
   2242 ; AVX1-NEXT:    vzeroupper
   2243 ; AVX1-NEXT:    retq
   2244 ;
   2245 ; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
   2246 ; AVX2-SLOW:       # %bb.0:
   2247 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
   2248 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
   2249 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   2250 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   2251 ; AVX2-SLOW-NEXT:    vpmulld %xmm7, %xmm3, %xmm3
   2252 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
   2253 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
   2254 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   2255 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   2256 ; AVX2-SLOW-NEXT:    vpmulld %xmm6, %xmm2, %xmm2
   2257 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   2258 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2259 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   2260 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   2261 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2262 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
   2263 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
   2264 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
   2265 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   2266 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   2267 ; AVX2-SLOW-NEXT:    vpmulld %xmm5, %xmm1, %xmm1
   2268 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
   2269 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
   2270 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   2271 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2272 ; AVX2-SLOW-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
   2273 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2274 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   2275 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2276 ; AVX2-SLOW-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
   2277 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   2278 ; AVX2-SLOW-NEXT:    vzeroupper
   2279 ; AVX2-SLOW-NEXT:    retq
   2280 ;
   2281 ; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8:
   2282 ; AVX2-FAST:       # %bb.0:
   2283 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7]
   2284 ; AVX2-FAST-NEXT:    vpermd %ymm7, %ymm8, %ymm7
   2285 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm8, %ymm3
   2286 ; AVX2-FAST-NEXT:    vpmulld %xmm7, %xmm3, %xmm3
   2287 ; AVX2-FAST-NEXT:    vpermd %ymm6, %ymm8, %ymm6
   2288 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm8, %ymm2
   2289 ; AVX2-FAST-NEXT:    vpmulld %xmm6, %xmm2, %xmm2
   2290 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   2291 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2292 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   2293 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   2294 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2295 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
   2296 ; AVX2-FAST-NEXT:    vpermd %ymm5, %ymm8, %ymm5
   2297 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm8, %ymm1
   2298 ; AVX2-FAST-NEXT:    vpmulld %xmm5, %xmm1, %xmm1
   2299 ; AVX2-FAST-NEXT:    vpermd %ymm4, %ymm8, %ymm4
   2300 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm8, %ymm0
   2301 ; AVX2-FAST-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
   2302 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2303 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   2304 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2305 ; AVX2-FAST-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
   2306 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   2307 ; AVX2-FAST-NEXT:    vzeroupper
   2308 ; AVX2-FAST-NEXT:    retq
   2309 ;
   2310 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
   2311 ; AVX512F:       # %bb.0:
   2312 ; AVX512F-NEXT:    vpmovqd %zmm3, %ymm3
   2313 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
   2314 ; AVX512F-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
   2315 ; AVX512F-NEXT:    vpmovqd %zmm2, %ymm2
   2316 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
   2317 ; AVX512F-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
   2318 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   2319 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   2320 ; AVX512F-NEXT:    vzeroupper
   2321 ; AVX512F-NEXT:    retq
   2322 ;
   2323 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
   2324 ; AVX512BW:       # %bb.0:
   2325 ; AVX512BW-NEXT:    vpmovqd %zmm3, %ymm3
   2326 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
   2327 ; AVX512BW-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
   2328 ; AVX512BW-NEXT:    vpmovqd %zmm2, %ymm2
   2329 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
   2330 ; AVX512BW-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
   2331 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   2332 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
   2333 ; AVX512BW-NEXT:    vzeroupper
   2334 ; AVX512BW-NEXT:    retq
   2335 ;
   2336 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
   2337 ; AVX512DQ:       # %bb.0:
   2338 ; AVX512DQ-NEXT:    vpmullq %zmm3, %zmm1, %zmm1
   2339 ; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm0, %zmm0
   2340 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
   2341 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
   2342 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   2343 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2344 ; AVX512DQ-NEXT:    vzeroupper
   2345 ; AVX512DQ-NEXT:    retq
   2346   %1 = mul <16 x i64> %a0, %a1
   2347   %2 = trunc <16 x i64> %1 to <16 x i8>
   2348   ret <16 x i8> %2
   2349 }
   2350 
   2351 define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
   2352 ; SSE-LABEL: trunc_mul_v16i32_v16i8:
   2353 ; SSE:       # %bb.0:
   2354 ; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
   2355 ; SSE-NEXT:    pmuludq %xmm4, %xmm0
   2356 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   2357 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
   2358 ; SSE-NEXT:    pmuludq %xmm8, %xmm4
   2359 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
   2360 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
   2361 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
   2362 ; SSE-NEXT:    pmuludq %xmm5, %xmm1
   2363 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   2364 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
   2365 ; SSE-NEXT:    pmuludq %xmm4, %xmm5
   2366 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
   2367 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
   2368 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
   2369 ; SSE-NEXT:    pmuludq %xmm6, %xmm2
   2370 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
   2371 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
   2372 ; SSE-NEXT:    pmuludq %xmm4, %xmm5
   2373 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
   2374 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
   2375 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
   2376 ; SSE-NEXT:    pmuludq %xmm7, %xmm3
   2377 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
   2378 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
   2379 ; SSE-NEXT:    pmuludq %xmm4, %xmm5
   2380 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
   2381 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
   2382 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   2383 ; SSE-NEXT:    pand %xmm4, %xmm3
   2384 ; SSE-NEXT:    pand %xmm4, %xmm2
   2385 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   2386 ; SSE-NEXT:    pand %xmm4, %xmm1
   2387 ; SSE-NEXT:    pand %xmm4, %xmm0
   2388 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   2389 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   2390 ; SSE-NEXT:    retq
   2391 ;
   2392 ; AVX1-LABEL: trunc_mul_v16i32_v16i8:
   2393 ; AVX1:       # %bb.0:
   2394 ; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm4
   2395 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
   2396 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2397 ; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
   2398 ; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm2
   2399 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
   2400 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   2401 ; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
   2402 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   2403 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   2404 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   2405 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
   2406 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   2407 ; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
   2408 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
   2409 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   2410 ; AVX1-NEXT:    vzeroupper
   2411 ; AVX1-NEXT:    retq
   2412 ;
   2413 ; AVX2-LABEL: trunc_mul_v16i32_v16i8:
   2414 ; AVX2:       # %bb.0:
   2415 ; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
   2416 ; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
   2417 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2418 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   2419 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   2420 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2421 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   2422 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   2423 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2424 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   2425 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2426 ; AVX2-NEXT:    vzeroupper
   2427 ; AVX2-NEXT:    retq
   2428 ;
   2429 ; AVX512-LABEL: trunc_mul_v16i32_v16i8:
   2430 ; AVX512:       # %bb.0:
   2431 ; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
   2432 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   2433 ; AVX512-NEXT:    vzeroupper
   2434 ; AVX512-NEXT:    retq
   2435   %1 = mul <16 x i32> %a0, %a1
   2436   %2 = trunc <16 x i32> %1 to <16 x i8>
   2437   ret <16 x i8> %2
   2438 }
   2439 
   2440 define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
   2441 ; SSE-LABEL: trunc_mul_v16i16_v16i8:
   2442 ; SSE:       # %bb.0:
   2443 ; SSE-NEXT:    pmullw %xmm2, %xmm0
   2444 ; SSE-NEXT:    pmullw %xmm3, %xmm1
   2445 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   2446 ; SSE-NEXT:    pand %xmm2, %xmm1
   2447 ; SSE-NEXT:    pand %xmm2, %xmm0
   2448 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   2449 ; SSE-NEXT:    retq
   2450 ;
   2451 ; AVX1-LABEL: trunc_mul_v16i16_v16i8:
   2452 ; AVX1:       # %bb.0:
   2453 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
   2454 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   2455 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2456 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2457 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2458 ; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
   2459 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
   2460 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   2461 ; AVX1-NEXT:    vzeroupper
   2462 ; AVX1-NEXT:    retq
   2463 ;
   2464 ; AVX2-LABEL: trunc_mul_v16i16_v16i8:
   2465 ; AVX2:       # %bb.0:
   2466 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2467 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2468 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2469 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   2470 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   2471 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2472 ; AVX2-NEXT:    vzeroupper
   2473 ; AVX2-NEXT:    retq
   2474 ;
   2475 ; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
   2476 ; AVX512F:       # %bb.0:
   2477 ; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2478 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   2479 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   2480 ; AVX512F-NEXT:    vzeroupper
   2481 ; AVX512F-NEXT:    retq
   2482 ;
   2483 ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
   2484 ; AVX512BW:       # %bb.0:
   2485 ; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2486 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2487 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   2488 ; AVX512BW-NEXT:    vzeroupper
   2489 ; AVX512BW-NEXT:    retq
   2490 ;
   2491 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
   2492 ; AVX512DQ:       # %bb.0:
   2493 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2494 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2495 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2496 ; AVX512DQ-NEXT:    vzeroupper
   2497 ; AVX512DQ-NEXT:    retq
   2498   %1 = mul <16 x i16> %a0, %a1
   2499   %2 = trunc <16 x i16> %1 to <16 x i8>
   2500   ret <16 x i8> %2
   2501 }
   2502 
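; Here the 8-bit source only needs to be zero-extended to i16, since the i32 product
; is truncated straight back to i16 (see the punpcklbw/pmovzxbw + pmullw sequences below).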
   2503 define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
   2504 ; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
   2505 ; SSE:       # %bb.0:
   2506 ; SSE-NEXT:    pxor %xmm3, %xmm3
   2507 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
   2508 ; SSE-NEXT:    pslld $16, %xmm2
   2509 ; SSE-NEXT:    psrad $16, %xmm2
   2510 ; SSE-NEXT:    pslld $16, %xmm1
   2511 ; SSE-NEXT:    psrad $16, %xmm1
   2512 ; SSE-NEXT:    packssdw %xmm2, %xmm1
   2513 ; SSE-NEXT:    pmullw %xmm1, %xmm0
   2514 ; SSE-NEXT:    retq
   2515 ;
   2516 ; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
   2517 ; AVX1:       # %bb.0:
   2518 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2519 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   2520 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   2521 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   2522 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   2523 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2524 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2525 ; AVX1-NEXT:    vzeroupper
   2526 ; AVX1-NEXT:    retq
   2527 ;
   2528 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
   2529 ; AVX2:       # %bb.0:
   2530 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2531 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2532 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   2533 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2534 ; AVX2-NEXT:    vzeroupper
   2535 ; AVX2-NEXT:    retq
   2536 ;
   2537 ; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
   2538 ; AVX512:       # %bb.0:
   2539 ; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
   2540 ; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
   2541 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2542 ; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2543 ; AVX512-NEXT:    vzeroupper
   2544 ; AVX512-NEXT:    retq
   2545   %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2546   %2 = zext <8 x i8> %1 to <8 x i32>
   2547   %3 = mul <8 x i32> %2, %a1
   2548   %4 = trunc <8 x i32> %3 to <8 x i16>
   2549   ret <8 x i16> %4
   2550 }
   2551 
   2552 ;
   2553 ; mul to constant
   2554 ;
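; Variants of trunc(mul) where one operand is the constant sequence <0,1,2,...>.
; With a constant operand the multiply can often be performed at the narrower
; width after truncation (see the vpmulld/pmullw on the packed results below).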
   2555 
   2556 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
   2557 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
   2558 ; SSE:       # %bb.0:
   2559 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,3]
   2560 ; SSE-NEXT:    movdqa %xmm1, %xmm3
   2561 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
   2562 ; SSE-NEXT:    psrlq $32, %xmm1
   2563 ; SSE-NEXT:    pmuludq %xmm2, %xmm1
   2564 ; SSE-NEXT:    psllq $32, %xmm1
   2565 ; SSE-NEXT:    paddq %xmm3, %xmm1
   2566 ; SSE-NEXT:    movl $1, %eax
   2567 ; SSE-NEXT:    movq %rax, %xmm2
   2568 ; SSE-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
   2569 ; SSE-NEXT:    movdqa %xmm0, %xmm3
   2570 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
   2571 ; SSE-NEXT:    psrlq $32, %xmm0
   2572 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
   2573 ; SSE-NEXT:    psllq $32, %xmm0
   2574 ; SSE-NEXT:    paddq %xmm3, %xmm0
   2575 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   2576 ; SSE-NEXT:    retq
   2577 ;
   2578 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
   2579 ; AVX1:       # %bb.0:
   2580 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2581 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   2582 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
   2583 ; AVX1-NEXT:    vzeroupper
   2584 ; AVX1-NEXT:    retq
   2585 ;
   2586 ; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
   2587 ; AVX2-SLOW:       # %bb.0:
   2588 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   2589 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2590 ; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
   2591 ; AVX2-SLOW-NEXT:    vzeroupper
   2592 ; AVX2-SLOW-NEXT:    retq
   2593 ;
   2594 ; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32:
   2595 ; AVX2-FAST:       # %bb.0:
   2596 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
   2597 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
   2598 ; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
   2599 ; AVX2-FAST-NEXT:    vzeroupper
   2600 ; AVX2-FAST-NEXT:    retq
   2601 ;
   2602 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
   2603 ; AVX512:       # %bb.0:
   2604 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   2605 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   2606 ; AVX512-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
   2607 ; AVX512-NEXT:    vzeroupper
   2608 ; AVX512-NEXT:    retq
   2609   %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   2610   %2 = trunc <4 x i64> %1 to <4 x i32>
   2611   ret <4 x i32> %2
   2612 }
   2613 
   2614 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
   2615 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
   2616 ; SSE:       # %bb.0:
   2617 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   2618 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   2619 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   2620 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
   2621 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   2622 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
   2623 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   2624 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
   2625 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   2626 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2627 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
   2628 ; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
   2629 ; SSE-NEXT:    retq
   2630 ;
   2631 ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
   2632 ; AVX1:       # %bb.0:
   2633 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2634 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
   2635 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   2636 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
   2637 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   2638 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2639 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   2640 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
   2641 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   2642 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   2643 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   2644 ; AVX1-NEXT:    vzeroupper
   2645 ; AVX1-NEXT:    retq
   2646 ;
   2647 ; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
   2648 ; AVX2-SLOW:       # %bb.0:
   2649 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   2650 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2651 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   2652 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   2653 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2654 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2655 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2656 ; AVX2-SLOW-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   2657 ; AVX2-SLOW-NEXT:    vzeroupper
   2658 ; AVX2-SLOW-NEXT:    retq
   2659 ;
   2660 ; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16:
   2661 ; AVX2-FAST:       # %bb.0:
   2662 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   2663 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   2664 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   2665 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2666 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2667 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2668 ; AVX2-FAST-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   2669 ; AVX2-FAST-NEXT:    vzeroupper
   2670 ; AVX2-FAST-NEXT:    retq
   2671 ;
   2672 ; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
   2673 ; AVX512:       # %bb.0:
   2674 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
   2675 ; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   2676 ; AVX512-NEXT:    vzeroupper
   2677 ; AVX512-NEXT:    retq
   2678   %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   2679   %2 = trunc <8 x i64> %1 to <8 x i16>
   2680   ret <8 x i16> %2
   2681 }
   2682 
   2683 define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
   2684 ; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
   2685 ; SSE:       # %bb.0:
   2686 ; SSE-NEXT:    pslld $16, %xmm1
   2687 ; SSE-NEXT:    psrad $16, %xmm1
   2688 ; SSE-NEXT:    pslld $16, %xmm0
   2689 ; SSE-NEXT:    psrad $16, %xmm0
   2690 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   2691 ; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
   2692 ; SSE-NEXT:    retq
   2693 ;
   2694 ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
   2695 ; AVX1:       # %bb.0:
   2696 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2697 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   2698 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   2699 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   2700 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2701 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   2702 ; AVX1-NEXT:    vzeroupper
   2703 ; AVX1-NEXT:    retq
   2704 ;
   2705 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
   2706 ; AVX2:       # %bb.0:
   2707 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2708 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2709 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   2710 ; AVX2-NEXT:    vzeroupper
   2711 ; AVX2-NEXT:    retq
   2712 ;
   2713 ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
   2714 ; AVX512:       # %bb.0:
   2715 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   2716 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   2717 ; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   2718 ; AVX512-NEXT:    vzeroupper
   2719 ; AVX512-NEXT:    retq
   2720   %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2721   %2 = trunc <8 x i32> %1 to <8 x i16>
   2722   ret <8 x i16> %2
   2723 }
   2724 
   2725 define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
   2726 ; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
   2727 ; SSE:       # %bb.0:
   2728 ; SSE-NEXT:    movl $1, %eax
   2729 ; SSE-NEXT:    movq %rax, %xmm8
   2730 ; SSE-NEXT:    pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
   2731 ; SSE-NEXT:    movdqa %xmm0, %xmm9
   2732 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2733 ; SSE-NEXT:    psrlq $32, %xmm0
   2734 ; SSE-NEXT:    pmuludq %xmm8, %xmm0
   2735 ; SSE-NEXT:    psllq $32, %xmm0
   2736 ; SSE-NEXT:    paddq %xmm9, %xmm0
   2737 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [2,3]
   2738 ; SSE-NEXT:    movdqa %xmm1, %xmm9
   2739 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2740 ; SSE-NEXT:    psrlq $32, %xmm1
   2741 ; SSE-NEXT:    pmuludq %xmm8, %xmm1
   2742 ; SSE-NEXT:    psllq $32, %xmm1
   2743 ; SSE-NEXT:    paddq %xmm9, %xmm1
   2744 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [4,5]
   2745 ; SSE-NEXT:    movdqa %xmm2, %xmm9
   2746 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2747 ; SSE-NEXT:    psrlq $32, %xmm2
   2748 ; SSE-NEXT:    pmuludq %xmm8, %xmm2
   2749 ; SSE-NEXT:    psllq $32, %xmm2
   2750 ; SSE-NEXT:    paddq %xmm9, %xmm2
   2751 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [6,7]
   2752 ; SSE-NEXT:    movdqa %xmm3, %xmm9
   2753 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2754 ; SSE-NEXT:    psrlq $32, %xmm3
   2755 ; SSE-NEXT:    pmuludq %xmm8, %xmm3
   2756 ; SSE-NEXT:    psllq $32, %xmm3
   2757 ; SSE-NEXT:    paddq %xmm9, %xmm3
   2758 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [8,9]
   2759 ; SSE-NEXT:    movdqa %xmm4, %xmm9
   2760 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2761 ; SSE-NEXT:    psrlq $32, %xmm4
   2762 ; SSE-NEXT:    pmuludq %xmm8, %xmm4
   2763 ; SSE-NEXT:    psllq $32, %xmm4
   2764 ; SSE-NEXT:    paddq %xmm9, %xmm4
   2765 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [10,11]
   2766 ; SSE-NEXT:    movdqa %xmm5, %xmm9
   2767 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2768 ; SSE-NEXT:    psrlq $32, %xmm5
   2769 ; SSE-NEXT:    pmuludq %xmm8, %xmm5
   2770 ; SSE-NEXT:    psllq $32, %xmm5
   2771 ; SSE-NEXT:    paddq %xmm9, %xmm5
   2772 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [12,13]
   2773 ; SSE-NEXT:    movdqa %xmm6, %xmm9
   2774 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2775 ; SSE-NEXT:    psrlq $32, %xmm6
   2776 ; SSE-NEXT:    pmuludq %xmm8, %xmm6
   2777 ; SSE-NEXT:    psllq $32, %xmm6
   2778 ; SSE-NEXT:    paddq %xmm9, %xmm6
   2779 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [14,15]
   2780 ; SSE-NEXT:    movdqa %xmm7, %xmm9
   2781 ; SSE-NEXT:    pmuludq %xmm8, %xmm9
   2782 ; SSE-NEXT:    psrlq $32, %xmm7
   2783 ; SSE-NEXT:    pmuludq %xmm8, %xmm7
   2784 ; SSE-NEXT:    psllq $32, %xmm7
   2785 ; SSE-NEXT:    paddq %xmm9, %xmm7
   2786 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   2787 ; SSE-NEXT:    pand %xmm8, %xmm7
   2788 ; SSE-NEXT:    pand %xmm8, %xmm6
   2789 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   2790 ; SSE-NEXT:    pand %xmm8, %xmm5
   2791 ; SSE-NEXT:    pand %xmm8, %xmm4
   2792 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   2793 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   2794 ; SSE-NEXT:    pand %xmm8, %xmm3
   2795 ; SSE-NEXT:    pand %xmm8, %xmm2
   2796 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   2797 ; SSE-NEXT:    pand %xmm8, %xmm1
   2798 ; SSE-NEXT:    pand %xmm8, %xmm0
   2799 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   2800 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   2801 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   2802 ; SSE-NEXT:    retq
   2803 ;
   2804 ; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
   2805 ; AVX1:       # %bb.0:
   2806 ; AVX1-NEXT:    movl $1, %eax
   2807 ; AVX1-NEXT:    vmovq %rax, %xmm4
   2808 ; AVX1-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
   2809 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm5
   2810 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
   2811 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
   2812 ; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
   2813 ; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm8
   2814 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2815 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [2,3]
   2816 ; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm6
   2817 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
   2818 ; AVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm0
   2819 ; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
   2820 ; AVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm9
   2821 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,5]
   2822 ; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm6
   2823 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm7
   2824 ; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
   2825 ; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
   2826 ; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
   2827 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   2828 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [6,7]
   2829 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm7
   2830 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
   2831 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm1, %xmm1
   2832 ; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
   2833 ; AVX1-NEXT:    vpaddq %xmm1, %xmm7, %xmm1
   2834 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [8,9]
   2835 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm7
   2836 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
   2837 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm4
   2838 ; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
   2839 ; AVX1-NEXT:    vpaddq %xmm4, %xmm7, %xmm4
   2840 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
   2841 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [10,11]
   2842 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm7
   2843 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
   2844 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm2, %xmm2
   2845 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
   2846 ; AVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
   2847 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [12,13]
   2848 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm7
   2849 ; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm0
   2850 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm0
   2851 ; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
   2852 ; AVX1-NEXT:    vpaddq %xmm0, %xmm7, %xmm0
   2853 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
   2854 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [14,15]
   2855 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm7
   2856 ; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm3
   2857 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm3
   2858 ; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
   2859 ; AVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
   2860 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   2861 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
   2862 ; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
   2863 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   2864 ; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
   2865 ; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm3
   2866 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
   2867 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
   2868 ; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
   2869 ; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm2
   2870 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
   2871 ; AVX1-NEXT:    vpand %xmm6, %xmm9, %xmm2
   2872 ; AVX1-NEXT:    vpand %xmm6, %xmm8, %xmm3
   2873 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
   2874 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
   2875 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
   2876 ; AVX1-NEXT:    vzeroupper
   2877 ; AVX1-NEXT:    retq
   2878 ;
   2879 ; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
   2880 ; AVX2-SLOW:       # %bb.0:
   2881 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   2882 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   2883 ; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
   2884 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   2885 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   2886 ; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
   2887 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   2888 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2889 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   2890 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   2891 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2892 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   2893 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   2894 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2895 ; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
   2896 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   2897 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   2898 ; AVX2-SLOW-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
   2899 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2900 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   2901 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2902 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2903 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   2904 ; AVX2-SLOW-NEXT:    vzeroupper
   2905 ; AVX2-SLOW-NEXT:    retq
   2906 ;
   2907 ; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8:
   2908 ; AVX2-FAST:       # %bb.0:
   2909 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   2910 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   2911 ; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm2, %xmm2
   2912 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   2913 ; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
   2914 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   2915 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   2916 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   2917 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   2918 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2919 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   2920 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   2921 ; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
   2922 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   2923 ; AVX2-FAST-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
   2924 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2925 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   2926 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   2927 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   2928 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   2929 ; AVX2-FAST-NEXT:    vzeroupper
   2930 ; AVX2-FAST-NEXT:    retq
   2931 ;
   2932 ; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
   2933 ; AVX512:       # %bb.0:
   2934 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   2935 ; AVX512-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
   2936 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
   2937 ; AVX512-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
   2938 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   2939 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   2940 ; AVX512-NEXT:    vzeroupper
   2941 ; AVX512-NEXT:    retq
   2942   %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   2943   %2 = trunc <16 x i64> %1 to <16 x i8>
   2944   ret <16 x i8> %2
   2945 }
   2946 
   2947 define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
   2948 ; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
   2949 ; SSE:       # %bb.0:
   2950 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,2,3]
   2951 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
   2952 ; SSE-NEXT:    pmuludq %xmm4, %xmm0
   2953 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   2954 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
   2955 ; SSE-NEXT:    pmuludq %xmm5, %xmm4
   2956 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
   2957 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
   2958 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [4,5,6,7]
   2959 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
   2960 ; SSE-NEXT:    pmuludq %xmm4, %xmm1
   2961 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   2962 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
   2963 ; SSE-NEXT:    pmuludq %xmm5, %xmm4
   2964 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
   2965 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
   2966 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,9,10,11]
   2967 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
   2968 ; SSE-NEXT:    pmuludq %xmm4, %xmm2
   2969 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
   2970 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
   2971 ; SSE-NEXT:    pmuludq %xmm5, %xmm4
   2972 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
   2973 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
   2974 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [12,13,14,15]
   2975 ; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
   2976 ; SSE-NEXT:    pmuludq %xmm4, %xmm3
   2977 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
   2978 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
   2979 ; SSE-NEXT:    pmuludq %xmm5, %xmm4
   2980 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
   2981 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
   2982 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   2983 ; SSE-NEXT:    pand %xmm4, %xmm3
   2984 ; SSE-NEXT:    pand %xmm4, %xmm2
   2985 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   2986 ; SSE-NEXT:    pand %xmm4, %xmm1
   2987 ; SSE-NEXT:    pand %xmm4, %xmm0
   2988 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   2989 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   2990 ; SSE-NEXT:    retq
   2991 ;
   2992 ; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
   2993 ; AVX1:       # %bb.0:
   2994 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm2
   2995 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2996 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
   2997 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm3
   2998 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   2999 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
   3000 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   3001 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   3002 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
   3003 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
   3004 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   3005 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   3006 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
   3007 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   3008 ; AVX1-NEXT:    vzeroupper
   3009 ; AVX1-NEXT:    retq
   3010 ;
   3011 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
   3012 ; AVX2:       # %bb.0:
   3013 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3014 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   3015 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   3016 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
   3017 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3018 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   3019 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   3020 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3021 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   3022 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   3023 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3024 ; AVX2-NEXT:    vzeroupper
   3025 ; AVX2-NEXT:    retq
   3026 ;
   3027 ; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
   3028 ; AVX512:       # %bb.0:
   3029 ; AVX512-NEXT:    vpmulld {{.*}}(%rip), %zmm0, %zmm0
   3030 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   3031 ; AVX512-NEXT:    vzeroupper
   3032 ; AVX512-NEXT:    retq
   3033   %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   3034   %2 = trunc <16 x i32> %1 to <16 x i8>
   3035   ret <16 x i8> %2
   3036 }
   3037 
   3038 define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
   3039 ; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
   3040 ; SSE:       # %bb.0:
   3041 ; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
   3042 ; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
   3043 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   3044 ; SSE-NEXT:    pand %xmm2, %xmm1
   3045 ; SSE-NEXT:    pand %xmm2, %xmm0
   3046 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   3047 ; SSE-NEXT:    retq
   3048 ;
   3049 ; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
   3050 ; AVX1:       # %bb.0:
   3051 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
   3052 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   3053 ; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
   3054 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3055 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   3056 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   3057 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
   3058 ; AVX1-NEXT:    vzeroupper
   3059 ; AVX1-NEXT:    retq
   3060 ;
   3061 ; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
   3062 ; AVX2:       # %bb.0:
   3063 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
   3064 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   3065 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3066 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   3067 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   3068 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3069 ; AVX2-NEXT:    vzeroupper
   3070 ; AVX2-NEXT:    retq
   3071 ;
   3072 ; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
   3073 ; AVX512F:       # %bb.0:
   3074 ; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
   3075 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   3076 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   3077 ; AVX512F-NEXT:    vzeroupper
   3078 ; AVX512F-NEXT:    retq
   3079 ;
   3080 ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
   3081 ; AVX512BW:       # %bb.0:
   3082 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
   3083 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   3084 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3085 ; AVX512BW-NEXT:    vzeroupper
   3086 ; AVX512BW-NEXT:    retq
   3087 ;
   3088 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
   3089 ; AVX512DQ:       # %bb.0:
   3090 ; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
   3091 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3092 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3093 ; AVX512DQ-NEXT:    vzeroupper
   3094 ; AVX512DQ-NEXT:    retq
   3095   %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   3096   %2 = trunc <16 x i16> %1 to <16 x i8>
   3097   ret <16 x i8> %2
   3098 }
   3099 
   3100 ;
   3101 ; and
   3102 ;
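; Bitwise and commutes with truncation, so these cases lower to a plain
; and/pand followed by the usual pack/permute truncation sequence.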
   3103 
   3104 define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
   3105 ; SSE-LABEL: trunc_and_v4i64_v4i32:
   3106 ; SSE:       # %bb.0:
   3107 ; SSE-NEXT:    andps %xmm3, %xmm1
   3108 ; SSE-NEXT:    andps %xmm2, %xmm0
   3109 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   3110 ; SSE-NEXT:    retq
   3111 ;
   3112 ; AVX1-LABEL: trunc_and_v4i64_v4i32:
   3113 ; AVX1:       # %bb.0:
   3114 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
   3115 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   3116 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   3117 ; AVX1-NEXT:    vzeroupper
   3118 ; AVX1-NEXT:    retq
   3119 ;
   3120 ; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
   3121 ; AVX2-SLOW:       # %bb.0:
   3122 ; AVX2-SLOW-NEXT:    vandps %ymm1, %ymm0, %ymm0
   3123 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   3124 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3125 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3126 ; AVX2-SLOW-NEXT:    vzeroupper
   3127 ; AVX2-SLOW-NEXT:    retq
   3128 ;
   3129 ; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32:
   3130 ; AVX2-FAST:       # %bb.0:
   3131 ; AVX2-FAST-NEXT:    vandps %ymm1, %ymm0, %ymm0
   3132 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
   3133 ; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
   3134 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3135 ; AVX2-FAST-NEXT:    vzeroupper
   3136 ; AVX2-FAST-NEXT:    retq
   3137 ;
   3138 ; AVX512-LABEL: trunc_and_v4i64_v4i32:
   3139 ; AVX512:       # %bb.0:
   3140 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
   3141 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   3142 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3143 ; AVX512-NEXT:    vzeroupper
   3144 ; AVX512-NEXT:    retq
   3145   %1 = and <4 x i64> %a0, %a1
   3146   %2 = trunc <4 x i64> %1 to <4 x i32>
   3147   ret <4 x i32> %2
   3148 }
   3149 
   3150 define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
   3151 ; SSE-LABEL: trunc_and_v8i64_v8i16:
   3152 ; SSE:       # %bb.0:
   3153 ; SSE-NEXT:    pand %xmm6, %xmm2
   3154 ; SSE-NEXT:    pand %xmm7, %xmm3
   3155 ; SSE-NEXT:    pand %xmm4, %xmm0
   3156 ; SSE-NEXT:    pand %xmm5, %xmm1
   3157 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   3158 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   3159 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   3160 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
   3161 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   3162 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
   3163 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   3164 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
   3165 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   3166 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3167 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
   3168 ; SSE-NEXT:    retq
   3169 ;
   3170 ; AVX1-LABEL: trunc_and_v8i64_v8i16:
   3171 ; AVX1:       # %bb.0:
   3172 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
   3173 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
   3174 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   3175 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   3176 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   3177 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
   3178 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   3179 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   3180 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   3181 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
   3182 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   3183 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   3184 ; AVX1-NEXT:    vzeroupper
   3185 ; AVX1-NEXT:    retq
   3186 ;
   3187 ; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
   3188 ; AVX2-SLOW:       # %bb.0:
   3189 ; AVX2-SLOW-NEXT:    vpand %ymm3, %ymm1, %ymm1
   3190 ; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm0, %ymm0
   3191 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   3192 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3193 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   3194 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   3195 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3196 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3197 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3198 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3199 ; AVX2-SLOW-NEXT:    vzeroupper
   3200 ; AVX2-SLOW-NEXT:    retq
   3201 ;
   3202 ; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16:
   3203 ; AVX2-FAST:       # %bb.0:
   3204 ; AVX2-FAST-NEXT:    vpand %ymm3, %ymm1, %ymm1
   3205 ; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
   3206 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   3207 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   3208 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   3209 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3210 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3211 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3212 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3213 ; AVX2-FAST-NEXT:    vzeroupper
   3214 ; AVX2-FAST-NEXT:    retq
   3215 ;
   3216 ; AVX512-LABEL: trunc_and_v8i64_v8i16:
   3217 ; AVX512:       # %bb.0:
   3218 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   3219 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
   3220 ; AVX512-NEXT:    vzeroupper
   3221 ; AVX512-NEXT:    retq
   3222   %1 = and <8 x i64> %a0, %a1
   3223   %2 = trunc <8 x i64> %1 to <8 x i16>
   3224   ret <8 x i16> %2
   3225 }
   3226 
   3227 define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
   3228 ; SSE-LABEL: trunc_and_v8i32_v8i16:
   3229 ; SSE:       # %bb.0:
   3230 ; SSE-NEXT:    pand %xmm2, %xmm0
   3231 ; SSE-NEXT:    pand %xmm3, %xmm1
   3232 ; SSE-NEXT:    pslld $16, %xmm1
   3233 ; SSE-NEXT:    psrad $16, %xmm1
   3234 ; SSE-NEXT:    pslld $16, %xmm0
   3235 ; SSE-NEXT:    psrad $16, %xmm0
   3236 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   3237 ; SSE-NEXT:    retq
   3238 ;
   3239 ; AVX1-LABEL: trunc_and_v8i32_v8i16:
   3240 ; AVX1:       # %bb.0:
   3241 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
   3242 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   3243 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   3244 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   3245 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   3246 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3247 ; AVX1-NEXT:    vzeroupper
   3248 ; AVX1-NEXT:    retq
   3249 ;
   3250 ; AVX2-LABEL: trunc_and_v8i32_v8i16:
   3251 ; AVX2:       # %bb.0:
   3252 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   3253 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3254 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3255 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3256 ; AVX2-NEXT:    vzeroupper
   3257 ; AVX2-NEXT:    retq
   3258 ;
   3259 ; AVX512-LABEL: trunc_and_v8i32_v8i16:
   3260 ; AVX512:       # %bb.0:
   3261 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
   3262 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   3263 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3264 ; AVX512-NEXT:    vzeroupper
   3265 ; AVX512-NEXT:    retq
   3266   %1 = and <8 x i32> %a0, %a1
   3267   %2 = trunc <8 x i32> %1 to <8 x i16>
   3268   ret <8 x i16> %2
   3269 }
   3270 
   3271 define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
   3272 ; SSE-LABEL: trunc_and_v16i64_v16i8:
   3273 ; SSE:       # %bb.0:
   3274 ; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm0
   3275 ; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm1
   3276 ; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm2
   3277 ; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm3
   3278 ; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm4
   3279 ; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm5
   3280 ; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm6
   3281 ; SSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm7
   3282 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   3283 ; SSE-NEXT:    pand %xmm8, %xmm7
   3284 ; SSE-NEXT:    pand %xmm8, %xmm6
   3285 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   3286 ; SSE-NEXT:    pand %xmm8, %xmm5
   3287 ; SSE-NEXT:    pand %xmm8, %xmm4
   3288 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   3289 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   3290 ; SSE-NEXT:    pand %xmm8, %xmm3
   3291 ; SSE-NEXT:    pand %xmm8, %xmm2
   3292 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   3293 ; SSE-NEXT:    pand %xmm8, %xmm1
   3294 ; SSE-NEXT:    pand %xmm8, %xmm0
   3295 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   3296 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   3297 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   3298 ; SSE-NEXT:    retq
   3299 ;
   3300 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
   3301 ; AVX1:       # %bb.0:
   3302 ; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
   3303 ; AVX1-NEXT:    vandps %ymm5, %ymm1, %ymm1
   3304 ; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
   3305 ; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
   3306 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
   3307 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   3308 ; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
   3309 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   3310 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
   3311 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   3312 ; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
   3313 ; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
   3314 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
   3315 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   3316 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   3317 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   3318 ; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
   3319 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   3320 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   3321 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   3322 ; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
   3323 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   3324 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   3325 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   3326 ; AVX1-NEXT:    vzeroupper
   3327 ; AVX1-NEXT:    retq
   3328 ;
   3329 ; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
   3330 ; AVX2-SLOW:       # %bb.0:
   3331 ; AVX2-SLOW-NEXT:    vpand %ymm5, %ymm1, %ymm1
   3332 ; AVX2-SLOW-NEXT:    vpand %ymm4, %ymm0, %ymm0
   3333 ; AVX2-SLOW-NEXT:    vpand %ymm7, %ymm3, %ymm3
   3334 ; AVX2-SLOW-NEXT:    vpand %ymm6, %ymm2, %ymm2
   3335 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   3336 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   3337 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   3338 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   3339 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   3340 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3341 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   3342 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   3343 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3344 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   3345 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   3346 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3347 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   3348 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   3349 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3350 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   3351 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3352 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   3353 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   3354 ; AVX2-SLOW-NEXT:    vzeroupper
   3355 ; AVX2-SLOW-NEXT:    retq
   3356 ;
   3357 ; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8:
   3358 ; AVX2-FAST:       # %bb.0:
   3359 ; AVX2-FAST-NEXT:    vpand %ymm5, %ymm1, %ymm1
   3360 ; AVX2-FAST-NEXT:    vpand %ymm4, %ymm0, %ymm0
   3361 ; AVX2-FAST-NEXT:    vpand %ymm7, %ymm3, %ymm3
   3362 ; AVX2-FAST-NEXT:    vpand %ymm6, %ymm2, %ymm2
   3363 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   3364 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   3365 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   3366 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   3367 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3368 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   3369 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   3370 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3371 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   3372 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   3373 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   3374 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3375 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   3376 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3377 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   3378 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   3379 ; AVX2-FAST-NEXT:    vzeroupper
   3380 ; AVX2-FAST-NEXT:    retq
   3381 ;
   3382 ; AVX512-LABEL: trunc_and_v16i64_v16i8:
   3383 ; AVX512:       # %bb.0:
   3384 ; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
   3385 ; AVX512-NEXT:    vpandq %zmm2, %zmm0, %zmm0
   3386 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   3387 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
   3388 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   3389 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   3390 ; AVX512-NEXT:    vzeroupper
   3391 ; AVX512-NEXT:    retq
   3392   %1 = and <16 x i64> %a0, %a1
   3393   %2 = trunc <16 x i64> %1 to <16 x i8>
   3394   ret <16 x i8> %2
   3395 }
   3396 
   3397 define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
   3398 ; SSE-LABEL: trunc_and_v16i32_v16i8:
   3399 ; SSE:       # %bb.0:
   3400 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   3401 ; SSE-NEXT:    pand %xmm8, %xmm7
   3402 ; SSE-NEXT:    pand %xmm3, %xmm7
   3403 ; SSE-NEXT:    pand %xmm8, %xmm6
   3404 ; SSE-NEXT:    pand %xmm2, %xmm6
   3405 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   3406 ; SSE-NEXT:    pand %xmm8, %xmm5
   3407 ; SSE-NEXT:    pand %xmm1, %xmm5
   3408 ; SSE-NEXT:    pand %xmm8, %xmm4
   3409 ; SSE-NEXT:    pand %xmm4, %xmm0
   3410 ; SSE-NEXT:    packuswb %xmm5, %xmm0
   3411 ; SSE-NEXT:    packuswb %xmm6, %xmm0
   3412 ; SSE-NEXT:    retq
   3413 ;
   3414 ; AVX1-LABEL: trunc_and_v16i32_v16i8:
   3415 ; AVX1:       # %bb.0:
   3416 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
   3417 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
   3418 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   3419 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   3420 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
   3421 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
   3422 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   3423 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   3424 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
   3425 ; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
   3426 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   3427 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   3428 ; AVX1-NEXT:    vzeroupper
   3429 ; AVX1-NEXT:    retq
   3430 ;
   3431 ; AVX2-LABEL: trunc_and_v16i32_v16i8:
   3432 ; AVX2:       # %bb.0:
   3433 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
   3434 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
   3435 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3436 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   3437 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   3438 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3439 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   3440 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   3441 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3442 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   3443 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3444 ; AVX2-NEXT:    vzeroupper
   3445 ; AVX2-NEXT:    retq
   3446 ;
   3447 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
   3448 ; AVX512:       # %bb.0:
   3449 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   3450 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   3451 ; AVX512-NEXT:    vzeroupper
   3452 ; AVX512-NEXT:    retq
   3453   %1 = and <16 x i32> %a0, %a1
   3454   %2 = trunc <16 x i32> %1 to <16 x i8>
   3455   ret <16 x i8> %2
   3456 }
   3457 
   3458 define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
   3459 ; SSE-LABEL: trunc_and_v16i16_v16i8:
   3460 ; SSE:       # %bb.0:
   3461 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
   3462 ; SSE-NEXT:    pand %xmm4, %xmm3
   3463 ; SSE-NEXT:    pand %xmm1, %xmm3
   3464 ; SSE-NEXT:    pand %xmm4, %xmm2
   3465 ; SSE-NEXT:    pand %xmm2, %xmm0
   3466 ; SSE-NEXT:    packuswb %xmm3, %xmm0
   3467 ; SSE-NEXT:    retq
   3468 ;
   3469 ; AVX1-LABEL: trunc_and_v16i16_v16i8:
   3470 ; AVX1:       # %bb.0:
   3471 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
   3472 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   3473 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3474 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   3475 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   3476 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3477 ; AVX1-NEXT:    vzeroupper
   3478 ; AVX1-NEXT:    retq
   3479 ;
   3480 ; AVX2-LABEL: trunc_and_v16i16_v16i8:
   3481 ; AVX2:       # %bb.0:
   3482 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   3483 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   3484 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3485 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   3486 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   3487 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3488 ; AVX2-NEXT:    vzeroupper
   3489 ; AVX2-NEXT:    retq
   3490 ;
   3491 ; AVX512F-LABEL: trunc_and_v16i16_v16i8:
   3492 ; AVX512F:       # %bb.0:
   3493 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
   3494 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   3495 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   3496 ; AVX512F-NEXT:    vzeroupper
   3497 ; AVX512F-NEXT:    retq
   3498 ;
   3499 ; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
   3500 ; AVX512BW:       # %bb.0:
   3501 ; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
   3502 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   3503 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3504 ; AVX512BW-NEXT:    vzeroupper
   3505 ; AVX512BW-NEXT:    retq
   3506 ;
   3507 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
   3508 ; AVX512DQ:       # %bb.0:
   3509 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
   3510 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3511 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3512 ; AVX512DQ-NEXT:    vzeroupper
   3513 ; AVX512DQ-NEXT:    retq
   3514   %1 = and <16 x i16> %a0, %a1
   3515   %2 = trunc <16 x i16> %1 to <16 x i8>
   3516   ret <16 x i8> %2
   3517 }
   3518 
   3519 ;
   3520 ; and to constant
   3521 ;
   3522 
   3523 define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
   3524 ; SSE-LABEL: trunc_and_const_v4i64_v4i32:
   3525 ; SSE:       # %bb.0:
   3526 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   3527 ; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
   3528 ; SSE-NEXT:    retq
   3529 ;
   3530 ; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
   3531 ; AVX1:       # %bb.0:
   3532 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   3533 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   3534 ; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
   3535 ; AVX1-NEXT:    vzeroupper
   3536 ; AVX1-NEXT:    retq
   3537 ;
   3538 ; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
   3539 ; AVX2-SLOW:       # %bb.0:
   3540 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   3541 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3542 ; AVX2-SLOW-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
   3543 ; AVX2-SLOW-NEXT:    vzeroupper
   3544 ; AVX2-SLOW-NEXT:    retq
   3545 ;
   3546 ; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32:
   3547 ; AVX2-FAST:       # %bb.0:
   3548 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
   3549 ; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
   3550 ; AVX2-FAST-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
   3551 ; AVX2-FAST-NEXT:    vzeroupper
   3552 ; AVX2-FAST-NEXT:    retq
   3553 ;
   3554 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
   3555 ; AVX512:       # %bb.0:
   3556 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   3557 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   3558 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3559 ; AVX512-NEXT:    vzeroupper
   3560 ; AVX512-NEXT:    retq
   3561   %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   3562   %2 = trunc <4 x i64> %1 to <4 x i32>
   3563   ret <4 x i32> %2
   3564 }
   3565 
   3566 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
   3567 ; SSE-LABEL: trunc_and_const_v8i64_v8i16:
   3568 ; SSE:       # %bb.0:
   3569 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   3570 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   3571 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   3572 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
   3573 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   3574 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
   3575 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   3576 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
   3577 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   3578 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3579 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
   3580 ; SSE-NEXT:    andpd {{.*}}(%rip), %xmm0
   3581 ; SSE-NEXT:    retq
   3582 ;
   3583 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
   3584 ; AVX1:       # %bb.0:
   3585 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   3586 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
   3587 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   3588 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
   3589 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   3590 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   3591 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   3592 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
   3593 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   3594 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   3595 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3596 ; AVX1-NEXT:    vzeroupper
   3597 ; AVX1-NEXT:    retq
   3598 ;
   3599 ; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
   3600 ; AVX2-SLOW:       # %bb.0:
   3601 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   3602 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3603 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   3604 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   3605 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3606 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3607 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3608 ; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3609 ; AVX2-SLOW-NEXT:    vzeroupper
   3610 ; AVX2-SLOW-NEXT:    retq
   3611 ;
   3612 ; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16:
   3613 ; AVX2-FAST:       # %bb.0:
   3614 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   3615 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   3616 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   3617 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3618 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3619 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3620 ; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3621 ; AVX2-FAST-NEXT:    vzeroupper
   3622 ; AVX2-FAST-NEXT:    retq
   3623 ;
   3624 ; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
   3625 ; AVX512:       # %bb.0:
   3626 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
   3627 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3628 ; AVX512-NEXT:    vzeroupper
   3629 ; AVX512-NEXT:    retq
   3630   %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   3631   %2 = trunc <8 x i64> %1 to <8 x i16>
   3632   ret <8 x i16> %2
   3633 }
   3634 
   3635 define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
   3636 ; SSE-LABEL: trunc_and_const_v8i32_v8i16:
   3637 ; SSE:       # %bb.0:
   3638 ; SSE-NEXT:    pslld $16, %xmm1
   3639 ; SSE-NEXT:    psrad $16, %xmm1
   3640 ; SSE-NEXT:    pslld $16, %xmm0
   3641 ; SSE-NEXT:    psrad $16, %xmm0
   3642 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   3643 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   3644 ; SSE-NEXT:    retq
   3645 ;
   3646 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
   3647 ; AVX1:       # %bb.0:
   3648 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   3649 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   3650 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   3651 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   3652 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3653 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3654 ; AVX1-NEXT:    vzeroupper
   3655 ; AVX1-NEXT:    retq
   3656 ;
   3657 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
   3658 ; AVX2:       # %bb.0:
   3659 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3660 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3661 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3662 ; AVX2-NEXT:    vzeroupper
   3663 ; AVX2-NEXT:    retq
   3664 ;
   3665 ; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
   3666 ; AVX512:       # %bb.0:
   3667 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   3668 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   3669 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3670 ; AVX512-NEXT:    vzeroupper
   3671 ; AVX512-NEXT:    retq
   3672   %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   3673   %2 = trunc <8 x i32> %1 to <8 x i16>
   3674   ret <8 x i16> %2
   3675 }
   3676 
   3677 define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
   3678 ; SSE-LABEL: trunc_and_const_v16i64_v16i8:
   3679 ; SSE:       # %bb.0:
   3680 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   3681 ; SSE-NEXT:    pand %xmm8, %xmm7
   3682 ; SSE-NEXT:    pand %xmm8, %xmm6
   3683 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   3684 ; SSE-NEXT:    pand %xmm8, %xmm5
   3685 ; SSE-NEXT:    pand %xmm8, %xmm4
   3686 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   3687 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   3688 ; SSE-NEXT:    pand %xmm8, %xmm3
   3689 ; SSE-NEXT:    pand %xmm8, %xmm2
   3690 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   3691 ; SSE-NEXT:    pand %xmm8, %xmm1
   3692 ; SSE-NEXT:    pand %xmm8, %xmm0
   3693 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   3694 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   3695 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   3696 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   3697 ; SSE-NEXT:    retq
   3698 ;
   3699 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
   3700 ; AVX1:       # %bb.0:
   3701 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
   3702 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   3703 ; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
   3704 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   3705 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
   3706 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   3707 ; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
   3708 ; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
   3709 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
   3710 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   3711 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   3712 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   3713 ; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
   3714 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   3715 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   3716 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   3717 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
   3718 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   3719 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   3720 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   3721 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3722 ; AVX1-NEXT:    vzeroupper
   3723 ; AVX1-NEXT:    retq
   3724 ;
   3725 ; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
   3726 ; AVX2-SLOW:       # %bb.0:
   3727 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   3728 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   3729 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   3730 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   3731 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   3732 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3733 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   3734 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   3735 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3736 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   3737 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   3738 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3739 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   3740 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   3741 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3742 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   3743 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3744 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   3745 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   3746 ; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3747 ; AVX2-SLOW-NEXT:    vzeroupper
   3748 ; AVX2-SLOW-NEXT:    retq
   3749 ;
   3750 ; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8:
   3751 ; AVX2-FAST:       # %bb.0:
   3752 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   3753 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   3754 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   3755 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   3756 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3757 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   3758 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   3759 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3760 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   3761 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   3762 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   3763 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3764 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   3765 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3766 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   3767 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   3768 ; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3769 ; AVX2-FAST-NEXT:    vzeroupper
   3770 ; AVX2-FAST-NEXT:    retq
   3771 ;
   3772 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
   3773 ; AVX512:       # %bb.0:
   3774 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   3775 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
   3776 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   3777 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   3778 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3779 ; AVX512-NEXT:    vzeroupper
   3780 ; AVX512-NEXT:    retq
   3781   %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   3782   %2 = trunc <16 x i64> %1 to <16 x i8>
   3783   ret <16 x i8> %2
   3784 }
   3785 
   3786 define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
   3787 ; SSE-LABEL: trunc_and_const_v16i32_v16i8:
   3788 ; SSE:       # %bb.0:
   3789 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   3790 ; SSE-NEXT:    pand %xmm4, %xmm3
   3791 ; SSE-NEXT:    pand %xmm4, %xmm2
   3792 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   3793 ; SSE-NEXT:    pand %xmm4, %xmm1
   3794 ; SSE-NEXT:    pand %xmm4, %xmm0
   3795 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   3796 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   3797 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   3798 ; SSE-NEXT:    retq
   3799 ;
   3800 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
   3801 ; AVX1:       # %bb.0:
   3802 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   3803 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   3804 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   3805 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   3806 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   3807 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   3808 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   3809 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   3810 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   3811 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   3812 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3813 ; AVX1-NEXT:    vzeroupper
   3814 ; AVX1-NEXT:    retq
   3815 ;
   3816 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
   3817 ; AVX2:       # %bb.0:
   3818 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3819 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   3820 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   3821 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3822 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   3823 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   3824 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3825 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   3826 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3827 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3828 ; AVX2-NEXT:    vzeroupper
   3829 ; AVX2-NEXT:    retq
   3830 ;
   3831 ; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
   3832 ; AVX512:       # %bb.0:
   3833 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   3834 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3835 ; AVX512-NEXT:    vzeroupper
   3836 ; AVX512-NEXT:    retq
   3837   %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   3838   %2 = trunc <16 x i32> %1 to <16 x i8>
   3839   ret <16 x i8> %2
   3840 }
   3841 
   3842 define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
   3843 ; SSE-LABEL: trunc_and_const_v16i16_v16i8:
   3844 ; SSE:       # %bb.0:
   3845 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   3846 ; SSE-NEXT:    pand %xmm2, %xmm1
   3847 ; SSE-NEXT:    pand %xmm2, %xmm0
   3848 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   3849 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
   3850 ; SSE-NEXT:    retq
   3851 ;
   3852 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
   3853 ; AVX1:       # %bb.0:
   3854 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   3855 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3856 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   3857 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   3858 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3859 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3860 ; AVX1-NEXT:    vzeroupper
   3861 ; AVX1-NEXT:    retq
   3862 ;
   3863 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
   3864 ; AVX2:       # %bb.0:
   3865 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   3866 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   3867 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   3868 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   3869 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   3870 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3871 ; AVX2-NEXT:    vzeroupper
   3872 ; AVX2-NEXT:    retq
   3873 ;
   3874 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
   3875 ; AVX512F:       # %bb.0:
   3876 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   3877 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   3878 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3879 ; AVX512F-NEXT:    vzeroupper
   3880 ; AVX512F-NEXT:    retq
   3881 ;
   3882 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
   3883 ; AVX512BW:       # %bb.0:
   3884 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   3885 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   3886 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3887 ; AVX512BW-NEXT:    vzeroupper
   3888 ; AVX512BW-NEXT:    retq
   3889 ;
   3890 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
   3891 ; AVX512DQ:       # %bb.0:
   3892 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3893 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3894 ; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
   3895 ; AVX512DQ-NEXT:    vzeroupper
   3896 ; AVX512DQ-NEXT:    retq
   3897   %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   3898   %2 = trunc <16 x i16> %1 to <16 x i8>
   3899   ret <16 x i8> %2
   3900 }
   3901 
   3902 ;
   3903 ; xor
   3904 ;
   3905 
   3906 define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
   3907 ; SSE-LABEL: trunc_xor_v4i64_v4i32:
   3908 ; SSE:       # %bb.0:
   3909 ; SSE-NEXT:    xorps %xmm3, %xmm1
   3910 ; SSE-NEXT:    xorps %xmm2, %xmm0
   3911 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   3912 ; SSE-NEXT:    retq
   3913 ;
   3914 ; AVX1-LABEL: trunc_xor_v4i64_v4i32:
   3915 ; AVX1:       # %bb.0:
   3916 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
   3917 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   3918 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   3919 ; AVX1-NEXT:    vzeroupper
   3920 ; AVX1-NEXT:    retq
   3921 ;
   3922 ; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
   3923 ; AVX2-SLOW:       # %bb.0:
   3924 ; AVX2-SLOW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
   3925 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   3926 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3927 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3928 ; AVX2-SLOW-NEXT:    vzeroupper
   3929 ; AVX2-SLOW-NEXT:    retq
   3930 ;
   3931 ; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32:
   3932 ; AVX2-FAST:       # %bb.0:
   3933 ; AVX2-FAST-NEXT:    vxorps %ymm1, %ymm0, %ymm0
   3934 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
   3935 ; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
   3936 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3937 ; AVX2-FAST-NEXT:    vzeroupper
   3938 ; AVX2-FAST-NEXT:    retq
   3939 ;
   3940 ; AVX512-LABEL: trunc_xor_v4i64_v4i32:
   3941 ; AVX512:       # %bb.0:
   3942 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
   3943 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   3944 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   3945 ; AVX512-NEXT:    vzeroupper
   3946 ; AVX512-NEXT:    retq
   3947   %1 = xor <4 x i64> %a0, %a1
   3948   %2 = trunc <4 x i64> %1 to <4 x i32>
   3949   ret <4 x i32> %2
   3950 }
   3951 
   3952 define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
   3953 ; SSE-LABEL: trunc_xor_v8i64_v8i16:
   3954 ; SSE:       # %bb.0:
   3955 ; SSE-NEXT:    pxor %xmm6, %xmm2
   3956 ; SSE-NEXT:    pxor %xmm7, %xmm3
   3957 ; SSE-NEXT:    pxor %xmm4, %xmm0
   3958 ; SSE-NEXT:    pxor %xmm5, %xmm1
   3959 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   3960 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   3961 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   3962 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
   3963 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   3964 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
   3965 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   3966 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
   3967 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   3968 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3969 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
   3970 ; SSE-NEXT:    retq
   3971 ;
   3972 ; AVX1-LABEL: trunc_xor_v8i64_v8i16:
   3973 ; AVX1:       # %bb.0:
   3974 ; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
   3975 ; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
   3976 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   3977 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   3978 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   3979 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
   3980 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   3981 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   3982 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   3983 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
   3984 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   3985 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   3986 ; AVX1-NEXT:    vzeroupper
   3987 ; AVX1-NEXT:    retq
   3988 ;
   3989 ; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
   3990 ; AVX2-SLOW:       # %bb.0:
   3991 ; AVX2-SLOW-NEXT:    vpxor %ymm3, %ymm1, %ymm1
   3992 ; AVX2-SLOW-NEXT:    vpxor %ymm2, %ymm0, %ymm0
   3993 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   3994 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   3995 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   3996 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   3997 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   3998 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   3999 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4000 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4001 ; AVX2-SLOW-NEXT:    vzeroupper
   4002 ; AVX2-SLOW-NEXT:    retq
   4003 ;
   4004 ; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16:
   4005 ; AVX2-FAST:       # %bb.0:
   4006 ; AVX2-FAST-NEXT:    vpxor %ymm3, %ymm1, %ymm1
   4007 ; AVX2-FAST-NEXT:    vpxor %ymm2, %ymm0, %ymm0
   4008 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   4009 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   4010 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   4011 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4012 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4013 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4014 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4015 ; AVX2-FAST-NEXT:    vzeroupper
   4016 ; AVX2-FAST-NEXT:    retq
   4017 ;
   4018 ; AVX512-LABEL: trunc_xor_v8i64_v8i16:
   4019 ; AVX512:       # %bb.0:
   4020 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
   4021 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
   4022 ; AVX512-NEXT:    vzeroupper
   4023 ; AVX512-NEXT:    retq
   4024   %1 = xor <8 x i64> %a0, %a1
   4025   %2 = trunc <8 x i64> %1 to <8 x i16>
   4026   ret <8 x i16> %2
   4027 }
   4028 
   4029 define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
   4030 ; SSE-LABEL: trunc_xor_v8i32_v8i16:
   4031 ; SSE:       # %bb.0:
   4032 ; SSE-NEXT:    pxor %xmm2, %xmm0
   4033 ; SSE-NEXT:    pxor %xmm3, %xmm1
   4034 ; SSE-NEXT:    pslld $16, %xmm1
   4035 ; SSE-NEXT:    psrad $16, %xmm1
   4036 ; SSE-NEXT:    pslld $16, %xmm0
   4037 ; SSE-NEXT:    psrad $16, %xmm0
   4038 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   4039 ; SSE-NEXT:    retq
   4040 ;
   4041 ; AVX1-LABEL: trunc_xor_v8i32_v8i16:
   4042 ; AVX1:       # %bb.0:
   4043 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
   4044 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   4045 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   4046 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   4047 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   4048 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4049 ; AVX1-NEXT:    vzeroupper
   4050 ; AVX1-NEXT:    retq
   4051 ;
   4052 ; AVX2-LABEL: trunc_xor_v8i32_v8i16:
   4053 ; AVX2:       # %bb.0:
   4054 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
   4055 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4056 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4057 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4058 ; AVX2-NEXT:    vzeroupper
   4059 ; AVX2-NEXT:    retq
   4060 ;
   4061 ; AVX512-LABEL: trunc_xor_v8i32_v8i16:
   4062 ; AVX512:       # %bb.0:
   4063 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
   4064 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   4065 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4066 ; AVX512-NEXT:    vzeroupper
   4067 ; AVX512-NEXT:    retq
   4068   %1 = xor <8 x i32> %a0, %a1
   4069   %2 = trunc <8 x i32> %1 to <8 x i16>
   4070   ret <8 x i16> %2
   4071 }
   4072 
   4073 define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
   4074 ; SSE-LABEL: trunc_xor_v16i64_v16i8:
   4075 ; SSE:       # %bb.0:
   4076 ; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm0
   4077 ; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm1
   4078 ; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm2
   4079 ; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm3
   4080 ; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm4
   4081 ; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm5
   4082 ; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm6
   4083 ; SSE-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm7
   4084 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   4085 ; SSE-NEXT:    pand %xmm8, %xmm7
   4086 ; SSE-NEXT:    pand %xmm8, %xmm6
   4087 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   4088 ; SSE-NEXT:    pand %xmm8, %xmm5
   4089 ; SSE-NEXT:    pand %xmm8, %xmm4
   4090 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   4091 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   4092 ; SSE-NEXT:    pand %xmm8, %xmm3
   4093 ; SSE-NEXT:    pand %xmm8, %xmm2
   4094 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   4095 ; SSE-NEXT:    pand %xmm8, %xmm1
   4096 ; SSE-NEXT:    pand %xmm8, %xmm0
   4097 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   4098 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   4099 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   4100 ; SSE-NEXT:    retq
   4101 ;
   4102 ; AVX1-LABEL: trunc_xor_v16i64_v16i8:
   4103 ; AVX1:       # %bb.0:
   4104 ; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
   4105 ; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
   4106 ; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
   4107 ; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
   4108 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
   4109 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   4110 ; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
   4111 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   4112 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
   4113 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   4114 ; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
   4115 ; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
   4116 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
   4117 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   4118 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   4119 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   4120 ; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
   4121 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   4122 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   4123 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   4124 ; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
   4125 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   4126 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   4127 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   4128 ; AVX1-NEXT:    vzeroupper
   4129 ; AVX1-NEXT:    retq
   4130 ;
   4131 ; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
   4132 ; AVX2-SLOW:       # %bb.0:
   4133 ; AVX2-SLOW-NEXT:    vpxor %ymm5, %ymm1, %ymm1
   4134 ; AVX2-SLOW-NEXT:    vpxor %ymm4, %ymm0, %ymm0
   4135 ; AVX2-SLOW-NEXT:    vpxor %ymm7, %ymm3, %ymm3
   4136 ; AVX2-SLOW-NEXT:    vpxor %ymm6, %ymm2, %ymm2
   4137 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   4138 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4139 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   4140 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   4141 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   4142 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4143 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   4144 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4145 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4146 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   4147 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   4148 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4149 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   4150 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   4151 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4152 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   4153 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4154 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   4155 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   4156 ; AVX2-SLOW-NEXT:    vzeroupper
   4157 ; AVX2-SLOW-NEXT:    retq
   4158 ;
   4159 ; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8:
   4160 ; AVX2-FAST:       # %bb.0:
   4161 ; AVX2-FAST-NEXT:    vpxor %ymm5, %ymm1, %ymm1
   4162 ; AVX2-FAST-NEXT:    vpxor %ymm4, %ymm0, %ymm0
   4163 ; AVX2-FAST-NEXT:    vpxor %ymm7, %ymm3, %ymm3
   4164 ; AVX2-FAST-NEXT:    vpxor %ymm6, %ymm2, %ymm2
   4165 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   4166 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   4167 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   4168 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   4169 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4170 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   4171 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4172 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4173 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   4174 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   4175 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   4176 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4177 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   4178 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4179 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   4180 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   4181 ; AVX2-FAST-NEXT:    vzeroupper
   4182 ; AVX2-FAST-NEXT:    retq
   4183 ;
   4184 ; AVX512-LABEL: trunc_xor_v16i64_v16i8:
   4185 ; AVX512:       # %bb.0:
   4186 ; AVX512-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
   4187 ; AVX512-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
   4188 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   4189 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
   4190 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   4191 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   4192 ; AVX512-NEXT:    vzeroupper
   4193 ; AVX512-NEXT:    retq
   4194   %1 = xor <16 x i64> %a0, %a1
   4195   %2 = trunc <16 x i64> %1 to <16 x i8>
   4196   ret <16 x i8> %2
   4197 }
   4198 
   4199 define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
   4200 ; SSE-LABEL: trunc_xor_v16i32_v16i8:
   4201 ; SSE:       # %bb.0:
   4202 ; SSE-NEXT:    pxor %xmm4, %xmm0
   4203 ; SSE-NEXT:    pxor %xmm5, %xmm1
   4204 ; SSE-NEXT:    pxor %xmm6, %xmm2
   4205 ; SSE-NEXT:    pxor %xmm7, %xmm3
   4206 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   4207 ; SSE-NEXT:    pand %xmm4, %xmm3
   4208 ; SSE-NEXT:    pand %xmm4, %xmm2
   4209 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   4210 ; SSE-NEXT:    pand %xmm4, %xmm1
   4211 ; SSE-NEXT:    pand %xmm4, %xmm0
   4212 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   4213 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   4214 ; SSE-NEXT:    retq
   4215 ;
   4216 ; AVX1-LABEL: trunc_xor_v16i32_v16i8:
   4217 ; AVX1:       # %bb.0:
   4218 ; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
   4219 ; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
   4220 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   4221 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   4222 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
   4223 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
   4224 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   4225 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   4226 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
   4227 ; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
   4228 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   4229 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   4230 ; AVX1-NEXT:    vzeroupper
   4231 ; AVX1-NEXT:    retq
   4232 ;
   4233 ; AVX2-LABEL: trunc_xor_v16i32_v16i8:
   4234 ; AVX2:       # %bb.0:
   4235 ; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
   4236 ; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
   4237 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4238 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   4239 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   4240 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4241 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   4242 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   4243 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4244 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   4245 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4246 ; AVX2-NEXT:    vzeroupper
   4247 ; AVX2-NEXT:    retq
   4248 ;
   4249 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
   4250 ; AVX512:       # %bb.0:
   4251 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
   4252 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   4253 ; AVX512-NEXT:    vzeroupper
   4254 ; AVX512-NEXT:    retq
   4255   %1 = xor <16 x i32> %a0, %a1
   4256   %2 = trunc <16 x i32> %1 to <16 x i8>
   4257   ret <16 x i8> %2
   4258 }
   4259 
   4260 define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
   4261 ; SSE-LABEL: trunc_xor_v16i16_v16i8:
   4262 ; SSE:       # %bb.0:
   4263 ; SSE-NEXT:    pxor %xmm2, %xmm0
   4264 ; SSE-NEXT:    pxor %xmm3, %xmm1
   4265 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   4266 ; SSE-NEXT:    pand %xmm2, %xmm1
   4267 ; SSE-NEXT:    pand %xmm2, %xmm0
   4268 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   4269 ; SSE-NEXT:    retq
   4270 ;
   4271 ; AVX1-LABEL: trunc_xor_v16i16_v16i8:
   4272 ; AVX1:       # %bb.0:
   4273 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
   4274 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   4275 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4276 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   4277 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   4278 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4279 ; AVX1-NEXT:    vzeroupper
   4280 ; AVX1-NEXT:    retq
   4281 ;
   4282 ; AVX2-LABEL: trunc_xor_v16i16_v16i8:
   4283 ; AVX2:       # %bb.0:
   4284 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
   4285 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   4286 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4287 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   4288 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   4289 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4290 ; AVX2-NEXT:    vzeroupper
   4291 ; AVX2-NEXT:    retq
   4292 ;
   4293 ; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
   4294 ; AVX512F:       # %bb.0:
   4295 ; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
   4296 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   4297 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   4298 ; AVX512F-NEXT:    vzeroupper
   4299 ; AVX512F-NEXT:    retq
   4300 ;
   4301 ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
   4302 ; AVX512BW:       # %bb.0:
   4303 ; AVX512BW-NEXT:    vpxor %ymm1, %ymm0, %ymm0
   4304 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   4305 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4306 ; AVX512BW-NEXT:    vzeroupper
   4307 ; AVX512BW-NEXT:    retq
   4308 ;
   4309 ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
   4310 ; AVX512DQ:       # %bb.0:
   4311 ; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
   4312 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   4313 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   4314 ; AVX512DQ-NEXT:    vzeroupper
   4315 ; AVX512DQ-NEXT:    retq
   4316   %1 = xor <16 x i16> %a0, %a1
   4317   %2 = trunc <16 x i16> %1 to <16 x i8>
   4318   ret <16 x i8> %2
   4319 }
   4320 
   4321 ;
   4322 ; xor to constant
   4323 ;
   4324 
   4325 define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
   4326 ; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
   4327 ; SSE:       # %bb.0:
   4328 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   4329 ; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
   4330 ; SSE-NEXT:    retq
   4331 ;
   4332 ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
   4333 ; AVX1:       # %bb.0:
   4334 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   4335 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   4336 ; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
   4337 ; AVX1-NEXT:    vzeroupper
   4338 ; AVX1-NEXT:    retq
   4339 ;
   4340 ; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
   4341 ; AVX2-SLOW:       # %bb.0:
   4342 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   4343 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4344 ; AVX2-SLOW-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
   4345 ; AVX2-SLOW-NEXT:    vzeroupper
   4346 ; AVX2-SLOW-NEXT:    retq
   4347 ;
   4348 ; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32:
   4349 ; AVX2-FAST:       # %bb.0:
   4350 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
   4351 ; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
   4352 ; AVX2-FAST-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
   4353 ; AVX2-FAST-NEXT:    vzeroupper
   4354 ; AVX2-FAST-NEXT:    retq
   4355 ;
   4356 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
   4357 ; AVX512:       # %bb.0:
   4358 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   4359 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   4360 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4361 ; AVX512-NEXT:    vzeroupper
   4362 ; AVX512-NEXT:    retq
   4363   %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   4364   %2 = trunc <4 x i64> %1 to <4 x i32>
   4365   ret <4 x i32> %2
   4366 }
   4367 
   4368 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
   4369 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
   4370 ; SSE:       # %bb.0:
   4371 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   4372 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   4373 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   4374 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
   4375 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   4376 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
   4377 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   4378 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
   4379 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   4380 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   4381 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
   4382 ; SSE-NEXT:    xorpd {{.*}}(%rip), %xmm0
   4383 ; SSE-NEXT:    retq
   4384 ;
   4385 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
   4386 ; AVX1:       # %bb.0:
   4387 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   4388 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
   4389 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   4390 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
   4391 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   4392 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   4393 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   4394 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
   4395 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   4396 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   4397 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4398 ; AVX1-NEXT:    vzeroupper
   4399 ; AVX1-NEXT:    retq
   4400 ;
   4401 ; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
   4402 ; AVX2-SLOW:       # %bb.0:
   4403 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   4404 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4405 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   4406 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   4407 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4408 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4409 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4410 ; AVX2-SLOW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4411 ; AVX2-SLOW-NEXT:    vzeroupper
   4412 ; AVX2-SLOW-NEXT:    retq
   4413 ;
   4414 ; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16:
   4415 ; AVX2-FAST:       # %bb.0:
   4416 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   4417 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   4418 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   4419 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4420 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4421 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4422 ; AVX2-FAST-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4423 ; AVX2-FAST-NEXT:    vzeroupper
   4424 ; AVX2-FAST-NEXT:    retq
   4425 ;
   4426 ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
   4427 ; AVX512:       # %bb.0:
   4428 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
   4429 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4430 ; AVX512-NEXT:    vzeroupper
   4431 ; AVX512-NEXT:    retq
   4432   %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   4433   %2 = trunc <8 x i64> %1 to <8 x i16>
   4434   ret <8 x i16> %2
   4435 }
   4436 
   4437 define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
   4438 ; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
   4439 ; SSE:       # %bb.0:
   4440 ; SSE-NEXT:    pslld $16, %xmm1
   4441 ; SSE-NEXT:    psrad $16, %xmm1
   4442 ; SSE-NEXT:    pslld $16, %xmm0
   4443 ; SSE-NEXT:    psrad $16, %xmm0
   4444 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   4445 ; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
   4446 ; SSE-NEXT:    retq
   4447 ;
   4448 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
   4449 ; AVX1:       # %bb.0:
   4450 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   4451 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   4452 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   4453 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   4454 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4455 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4456 ; AVX1-NEXT:    vzeroupper
   4457 ; AVX1-NEXT:    retq
   4458 ;
   4459 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
   4460 ; AVX2:       # %bb.0:
   4461 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4462 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4463 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4464 ; AVX2-NEXT:    vzeroupper
   4465 ; AVX2-NEXT:    retq
   4466 ;
   4467 ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
   4468 ; AVX512:       # %bb.0:
   4469 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   4470 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   4471 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4472 ; AVX512-NEXT:    vzeroupper
   4473 ; AVX512-NEXT:    retq
   4474   %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   4475   %2 = trunc <8 x i32> %1 to <8 x i16>
   4476   ret <8 x i16> %2
   4477 }
   4478 
   4479 define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
   4480 ; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
   4481 ; SSE:       # %bb.0:
   4482 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   4483 ; SSE-NEXT:    pand %xmm8, %xmm7
   4484 ; SSE-NEXT:    pand %xmm8, %xmm6
   4485 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   4486 ; SSE-NEXT:    pand %xmm8, %xmm5
   4487 ; SSE-NEXT:    pand %xmm8, %xmm4
   4488 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   4489 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   4490 ; SSE-NEXT:    pand %xmm8, %xmm3
   4491 ; SSE-NEXT:    pand %xmm8, %xmm2
   4492 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   4493 ; SSE-NEXT:    pand %xmm8, %xmm1
   4494 ; SSE-NEXT:    pand %xmm8, %xmm0
   4495 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   4496 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   4497 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   4498 ; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
   4499 ; SSE-NEXT:    retq
   4500 ;
   4501 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
   4502 ; AVX1:       # %bb.0:
   4503 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
   4504 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   4505 ; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
   4506 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   4507 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
   4508 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   4509 ; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
   4510 ; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
   4511 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
   4512 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   4513 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   4514 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   4515 ; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
   4516 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   4517 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   4518 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   4519 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
   4520 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   4521 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   4522 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   4523 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4524 ; AVX1-NEXT:    vzeroupper
   4525 ; AVX1-NEXT:    retq
   4526 ;
   4527 ; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
   4528 ; AVX2-SLOW:       # %bb.0:
   4529 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   4530 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4531 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   4532 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   4533 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   4534 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4535 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   4536 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4537 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4538 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   4539 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   4540 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4541 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   4542 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   4543 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4544 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   4545 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4546 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   4547 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   4548 ; AVX2-SLOW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4549 ; AVX2-SLOW-NEXT:    vzeroupper
   4550 ; AVX2-SLOW-NEXT:    retq
   4551 ;
   4552 ; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8:
   4553 ; AVX2-FAST:       # %bb.0:
   4554 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   4555 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   4556 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   4557 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   4558 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4559 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   4560 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4561 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4562 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   4563 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   4564 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   4565 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4566 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   4567 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4568 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   4569 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   4570 ; AVX2-FAST-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4571 ; AVX2-FAST-NEXT:    vzeroupper
   4572 ; AVX2-FAST-NEXT:    retq
   4573 ;
   4574 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
   4575 ; AVX512:       # %bb.0:
   4576 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   4577 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
   4578 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   4579 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   4580 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4581 ; AVX512-NEXT:    vzeroupper
   4582 ; AVX512-NEXT:    retq
   4583   %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   4584   %2 = trunc <16 x i64> %1 to <16 x i8>
   4585   ret <16 x i8> %2
   4586 }
   4587 
   4588 define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
   4589 ; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
   4590 ; SSE:       # %bb.0:
   4591 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   4592 ; SSE-NEXT:    pand %xmm4, %xmm3
   4593 ; SSE-NEXT:    pand %xmm4, %xmm2
   4594 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   4595 ; SSE-NEXT:    pand %xmm4, %xmm1
   4596 ; SSE-NEXT:    pand %xmm4, %xmm0
   4597 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   4598 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   4599 ; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
   4600 ; SSE-NEXT:    retq
   4601 ;
   4602 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
   4603 ; AVX1:       # %bb.0:
   4604 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   4605 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   4606 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   4607 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   4608 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   4609 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   4610 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   4611 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   4612 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   4613 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   4614 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4615 ; AVX1-NEXT:    vzeroupper
   4616 ; AVX1-NEXT:    retq
   4617 ;
   4618 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
   4619 ; AVX2:       # %bb.0:
   4620 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4621 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   4622 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   4623 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4624 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   4625 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   4626 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4627 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   4628 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4629 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4630 ; AVX2-NEXT:    vzeroupper
   4631 ; AVX2-NEXT:    retq
   4632 ;
   4633 ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
   4634 ; AVX512:       # %bb.0:
   4635 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   4636 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4637 ; AVX512-NEXT:    vzeroupper
   4638 ; AVX512-NEXT:    retq
   4639   %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   4640   %2 = trunc <16 x i32> %1 to <16 x i8>
   4641   ret <16 x i8> %2
   4642 }
   4643 
   4644 define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
   4645 ; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
   4646 ; SSE:       # %bb.0:
   4647 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   4648 ; SSE-NEXT:    pand %xmm2, %xmm1
   4649 ; SSE-NEXT:    pand %xmm2, %xmm0
   4650 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   4651 ; SSE-NEXT:    pxor {{.*}}(%rip), %xmm0
   4652 ; SSE-NEXT:    retq
   4653 ;
   4654 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
   4655 ; AVX1:       # %bb.0:
   4656 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   4657 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4658 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   4659 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   4660 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4661 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4662 ; AVX1-NEXT:    vzeroupper
   4663 ; AVX1-NEXT:    retq
   4664 ;
   4665 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
   4666 ; AVX2:       # %bb.0:
   4667 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   4668 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4669 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   4670 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   4671 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4672 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4673 ; AVX2-NEXT:    vzeroupper
   4674 ; AVX2-NEXT:    retq
   4675 ;
   4676 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
   4677 ; AVX512F:       # %bb.0:
   4678 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   4679 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   4680 ; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4681 ; AVX512F-NEXT:    vzeroupper
   4682 ; AVX512F-NEXT:    retq
   4683 ;
   4684 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
   4685 ; AVX512BW:       # %bb.0:
   4686 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   4687 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   4688 ; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4689 ; AVX512BW-NEXT:    vzeroupper
   4690 ; AVX512BW-NEXT:    retq
   4691 ;
   4692 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
   4693 ; AVX512DQ:       # %bb.0:
   4694 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   4695 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   4696 ; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
   4697 ; AVX512DQ-NEXT:    vzeroupper
   4698 ; AVX512DQ-NEXT:    retq
   4699   %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   4700   %2 = trunc <16 x i16> %1 to <16 x i8>
   4701   ret <16 x i8> %2
   4702 }
   4703 
   4704 ;
   4705 ; or
   4706 ;
   4707 
   4708 define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
   4709 ; SSE-LABEL: trunc_or_v4i64_v4i32:
   4710 ; SSE:       # %bb.0:
   4711 ; SSE-NEXT:    orps %xmm3, %xmm1
   4712 ; SSE-NEXT:    orps %xmm2, %xmm0
   4713 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   4714 ; SSE-NEXT:    retq
   4715 ;
   4716 ; AVX1-LABEL: trunc_or_v4i64_v4i32:
   4717 ; AVX1:       # %bb.0:
   4718 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
   4719 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   4720 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   4721 ; AVX1-NEXT:    vzeroupper
   4722 ; AVX1-NEXT:    retq
   4723 ;
   4724 ; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
   4725 ; AVX2-SLOW:       # %bb.0:
   4726 ; AVX2-SLOW-NEXT:    vorps %ymm1, %ymm0, %ymm0
   4727 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   4728 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4729 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4730 ; AVX2-SLOW-NEXT:    vzeroupper
   4731 ; AVX2-SLOW-NEXT:    retq
   4732 ;
   4733 ; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32:
   4734 ; AVX2-FAST:       # %bb.0:
   4735 ; AVX2-FAST-NEXT:    vorps %ymm1, %ymm0, %ymm0
   4736 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
   4737 ; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
   4738 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4739 ; AVX2-FAST-NEXT:    vzeroupper
   4740 ; AVX2-FAST-NEXT:    retq
   4741 ;
   4742 ; AVX512-LABEL: trunc_or_v4i64_v4i32:
   4743 ; AVX512:       # %bb.0:
   4744 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
   4745 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   4746 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4747 ; AVX512-NEXT:    vzeroupper
   4748 ; AVX512-NEXT:    retq
   4749   %1 = or <4 x i64> %a0, %a1
   4750   %2 = trunc <4 x i64> %1 to <4 x i32>
   4751   ret <4 x i32> %2
   4752 }
   4753 
   4754 define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
   4755 ; SSE-LABEL: trunc_or_v8i64_v8i16:
   4756 ; SSE:       # %bb.0:
   4757 ; SSE-NEXT:    por %xmm6, %xmm2
   4758 ; SSE-NEXT:    por %xmm7, %xmm3
   4759 ; SSE-NEXT:    por %xmm4, %xmm0
   4760 ; SSE-NEXT:    por %xmm5, %xmm1
   4761 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   4762 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   4763 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   4764 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
   4765 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   4766 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
   4767 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   4768 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
   4769 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   4770 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   4771 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
   4772 ; SSE-NEXT:    retq
   4773 ;
   4774 ; AVX1-LABEL: trunc_or_v8i64_v8i16:
   4775 ; AVX1:       # %bb.0:
   4776 ; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
   4777 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
   4778 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   4779 ; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
   4780 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   4781 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
   4782 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   4783 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   4784 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   4785 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
   4786 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   4787 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   4788 ; AVX1-NEXT:    vzeroupper
   4789 ; AVX1-NEXT:    retq
   4790 ;
   4791 ; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
   4792 ; AVX2-SLOW:       # %bb.0:
   4793 ; AVX2-SLOW-NEXT:    vpor %ymm3, %ymm1, %ymm1
   4794 ; AVX2-SLOW-NEXT:    vpor %ymm2, %ymm0, %ymm0
   4795 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   4796 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4797 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   4798 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   4799 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4800 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4801 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4802 ; AVX2-SLOW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4803 ; AVX2-SLOW-NEXT:    vzeroupper
   4804 ; AVX2-SLOW-NEXT:    retq
   4805 ;
   4806 ; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16:
   4807 ; AVX2-FAST:       # %bb.0:
   4808 ; AVX2-FAST-NEXT:    vpor %ymm3, %ymm1, %ymm1
   4809 ; AVX2-FAST-NEXT:    vpor %ymm2, %ymm0, %ymm0
   4810 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   4811 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   4812 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   4813 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4814 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4815 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4816 ; AVX2-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4817 ; AVX2-FAST-NEXT:    vzeroupper
   4818 ; AVX2-FAST-NEXT:    retq
   4819 ;
   4820 ; AVX512-LABEL: trunc_or_v8i64_v8i16:
   4821 ; AVX512:       # %bb.0:
   4822 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
   4823 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
   4824 ; AVX512-NEXT:    vzeroupper
   4825 ; AVX512-NEXT:    retq
   4826   %1 = or <8 x i64> %a0, %a1
   4827   %2 = trunc <8 x i64> %1 to <8 x i16>
   4828   ret <8 x i16> %2
   4829 }
   4830 
   4831 define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
   4832 ; SSE-LABEL: trunc_or_v8i32_v8i16:
   4833 ; SSE:       # %bb.0:
   4834 ; SSE-NEXT:    por %xmm2, %xmm0
   4835 ; SSE-NEXT:    por %xmm3, %xmm1
   4836 ; SSE-NEXT:    pslld $16, %xmm1
   4837 ; SSE-NEXT:    psrad $16, %xmm1
   4838 ; SSE-NEXT:    pslld $16, %xmm0
   4839 ; SSE-NEXT:    psrad $16, %xmm0
   4840 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   4841 ; SSE-NEXT:    retq
   4842 ;
   4843 ; AVX1-LABEL: trunc_or_v8i32_v8i16:
   4844 ; AVX1:       # %bb.0:
   4845 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
   4846 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   4847 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   4848 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   4849 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   4850 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   4851 ; AVX1-NEXT:    vzeroupper
   4852 ; AVX1-NEXT:    retq
   4853 ;
   4854 ; AVX2-LABEL: trunc_or_v8i32_v8i16:
   4855 ; AVX2:       # %bb.0:
   4856 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
   4857 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4858 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4859 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4860 ; AVX2-NEXT:    vzeroupper
   4861 ; AVX2-NEXT:    retq
   4862 ;
   4863 ; AVX512-LABEL: trunc_or_v8i32_v8i16:
   4864 ; AVX512:       # %bb.0:
   4865 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
   4866 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   4867 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   4868 ; AVX512-NEXT:    vzeroupper
   4869 ; AVX512-NEXT:    retq
   4870   %1 = or <8 x i32> %a0, %a1
   4871   %2 = trunc <8 x i32> %1 to <8 x i16>
   4872   ret <8 x i16> %2
   4873 }
   4874 
   4875 define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
   4876 ; SSE-LABEL: trunc_or_v16i64_v16i8:
   4877 ; SSE:       # %bb.0:
   4878 ; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
   4879 ; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
   4880 ; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm2
   4881 ; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm3
   4882 ; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm4
   4883 ; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm5
   4884 ; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm6
   4885 ; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm7
   4886 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   4887 ; SSE-NEXT:    pand %xmm8, %xmm7
   4888 ; SSE-NEXT:    pand %xmm8, %xmm6
   4889 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   4890 ; SSE-NEXT:    pand %xmm8, %xmm5
   4891 ; SSE-NEXT:    pand %xmm8, %xmm4
   4892 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   4893 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   4894 ; SSE-NEXT:    pand %xmm8, %xmm3
   4895 ; SSE-NEXT:    pand %xmm8, %xmm2
   4896 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   4897 ; SSE-NEXT:    pand %xmm8, %xmm1
   4898 ; SSE-NEXT:    pand %xmm8, %xmm0
   4899 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   4900 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   4901 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   4902 ; SSE-NEXT:    retq
   4903 ;
   4904 ; AVX1-LABEL: trunc_or_v16i64_v16i8:
   4905 ; AVX1:       # %bb.0:
   4906 ; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
   4907 ; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
   4908 ; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
   4909 ; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
   4910 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
   4911 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   4912 ; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
   4913 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   4914 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
   4915 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   4916 ; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
   4917 ; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
   4918 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
   4919 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   4920 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   4921 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   4922 ; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
   4923 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   4924 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   4925 ; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
   4926 ; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
   4927 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   4928 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   4929 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   4930 ; AVX1-NEXT:    vzeroupper
   4931 ; AVX1-NEXT:    retq
   4932 ;
   4933 ; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
   4934 ; AVX2-SLOW:       # %bb.0:
   4935 ; AVX2-SLOW-NEXT:    vpor %ymm5, %ymm1, %ymm1
   4936 ; AVX2-SLOW-NEXT:    vpor %ymm4, %ymm0, %ymm0
   4937 ; AVX2-SLOW-NEXT:    vpor %ymm7, %ymm3, %ymm3
   4938 ; AVX2-SLOW-NEXT:    vpor %ymm6, %ymm2, %ymm2
   4939 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   4940 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4941 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   4942 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   4943 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   4944 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4945 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   4946 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4947 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4948 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   4949 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   4950 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4951 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   4952 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   4953 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4954 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   4955 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4956 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   4957 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   4958 ; AVX2-SLOW-NEXT:    vzeroupper
   4959 ; AVX2-SLOW-NEXT:    retq
   4960 ;
   4961 ; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8:
   4962 ; AVX2-FAST:       # %bb.0:
   4963 ; AVX2-FAST-NEXT:    vpor %ymm5, %ymm1, %ymm1
   4964 ; AVX2-FAST-NEXT:    vpor %ymm4, %ymm0, %ymm0
   4965 ; AVX2-FAST-NEXT:    vpor %ymm7, %ymm3, %ymm3
   4966 ; AVX2-FAST-NEXT:    vpor %ymm6, %ymm2, %ymm2
   4967 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   4968 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   4969 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   4970 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   4971 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   4972 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   4973 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   4974 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   4975 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   4976 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   4977 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   4978 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   4979 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   4980 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   4981 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   4982 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   4983 ; AVX2-FAST-NEXT:    vzeroupper
   4984 ; AVX2-FAST-NEXT:    retq
   4985 ;
   4986 ; AVX512-LABEL: trunc_or_v16i64_v16i8:
   4987 ; AVX512:       # %bb.0:
   4988 ; AVX512-NEXT:    vporq %zmm3, %zmm1, %zmm1
   4989 ; AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
   4990 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   4991 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
   4992 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   4993 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   4994 ; AVX512-NEXT:    vzeroupper
   4995 ; AVX512-NEXT:    retq
   4996   %1 = or <16 x i64> %a0, %a1
   4997   %2 = trunc <16 x i64> %1 to <16 x i8>
   4998   ret <16 x i8> %2
   4999 }
   5000 
   5001 define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
   5002 ; SSE-LABEL: trunc_or_v16i32_v16i8:
   5003 ; SSE:       # %bb.0:
   5004 ; SSE-NEXT:    por %xmm4, %xmm0
   5005 ; SSE-NEXT:    por %xmm5, %xmm1
   5006 ; SSE-NEXT:    por %xmm6, %xmm2
   5007 ; SSE-NEXT:    por %xmm7, %xmm3
   5008 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   5009 ; SSE-NEXT:    pand %xmm4, %xmm3
   5010 ; SSE-NEXT:    pand %xmm4, %xmm2
   5011 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   5012 ; SSE-NEXT:    pand %xmm4, %xmm1
   5013 ; SSE-NEXT:    pand %xmm4, %xmm0
   5014 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   5015 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   5016 ; SSE-NEXT:    retq
   5017 ;
   5018 ; AVX1-LABEL: trunc_or_v16i32_v16i8:
   5019 ; AVX1:       # %bb.0:
   5020 ; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
   5021 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
   5022 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   5023 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   5024 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
   5025 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
   5026 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   5027 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   5028 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
   5029 ; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
   5030 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   5031 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   5032 ; AVX1-NEXT:    vzeroupper
   5033 ; AVX1-NEXT:    retq
   5034 ;
   5035 ; AVX2-LABEL: trunc_or_v16i32_v16i8:
   5036 ; AVX2:       # %bb.0:
   5037 ; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
   5038 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
   5039 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   5040 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   5041 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   5042 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   5043 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   5044 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   5045 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5046 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   5047 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   5048 ; AVX2-NEXT:    vzeroupper
   5049 ; AVX2-NEXT:    retq
   5050 ;
   5051 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
   5052 ; AVX512:       # %bb.0:
   5053 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
   5054 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   5055 ; AVX512-NEXT:    vzeroupper
   5056 ; AVX512-NEXT:    retq
   5057   %1 = or <16 x i32> %a0, %a1
   5058   %2 = trunc <16 x i32> %1 to <16 x i8>
   5059   ret <16 x i8> %2
   5060 }
   5061 
   5062 define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
   5063 ; SSE-LABEL: trunc_or_v16i16_v16i8:
   5064 ; SSE:       # %bb.0:
   5065 ; SSE-NEXT:    por %xmm2, %xmm0
   5066 ; SSE-NEXT:    por %xmm3, %xmm1
   5067 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   5068 ; SSE-NEXT:    pand %xmm2, %xmm1
   5069 ; SSE-NEXT:    pand %xmm2, %xmm0
   5070 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   5071 ; SSE-NEXT:    retq
   5072 ;
   5073 ; AVX1-LABEL: trunc_or_v16i16_v16i8:
   5074 ; AVX1:       # %bb.0:
   5075 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
   5076 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   5077 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   5078 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   5079 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   5080 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   5081 ; AVX1-NEXT:    vzeroupper
   5082 ; AVX1-NEXT:    retq
   5083 ;
   5084 ; AVX2-LABEL: trunc_or_v16i16_v16i8:
   5085 ; AVX2:       # %bb.0:
   5086 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
   5087 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   5088 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   5089 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   5090 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   5091 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   5092 ; AVX2-NEXT:    vzeroupper
   5093 ; AVX2-NEXT:    retq
   5094 ;
   5095 ; AVX512F-LABEL: trunc_or_v16i16_v16i8:
   5096 ; AVX512F:       # %bb.0:
   5097 ; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
   5098 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   5099 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   5100 ; AVX512F-NEXT:    vzeroupper
   5101 ; AVX512F-NEXT:    retq
   5102 ;
   5103 ; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
   5104 ; AVX512BW:       # %bb.0:
   5105 ; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
   5106 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   5107 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
   5108 ; AVX512BW-NEXT:    vzeroupper
   5109 ; AVX512BW-NEXT:    retq
   5110 ;
   5111 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
   5112 ; AVX512DQ:       # %bb.0:
   5113 ; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
   5114 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   5115 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   5116 ; AVX512DQ-NEXT:    vzeroupper
   5117 ; AVX512DQ-NEXT:    retq
   5118   %1 = or <16 x i16> %a0, %a1
   5119   %2 = trunc <16 x i16> %1 to <16 x i8>
   5120   ret <16 x i8> %2
   5121 }
   5122 
   5123 ;
   5124 ; or to constant
   5125 ;
   5126 
   5127 define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
   5128 ; SSE-LABEL: trunc_or_const_v4i64_v4i32:
   5129 ; SSE:       # %bb.0:
   5130 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   5131 ; SSE-NEXT:    orps {{.*}}(%rip), %xmm0
   5132 ; SSE-NEXT:    retq
   5133 ;
   5134 ; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
   5135 ; AVX1:       # %bb.0:
   5136 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   5137 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
   5138 ; AVX1-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
   5139 ; AVX1-NEXT:    vzeroupper
   5140 ; AVX1-NEXT:    retq
   5141 ;
   5142 ; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
   5143 ; AVX2-SLOW:       # %bb.0:
   5144 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   5145 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5146 ; AVX2-SLOW-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
   5147 ; AVX2-SLOW-NEXT:    vzeroupper
   5148 ; AVX2-SLOW-NEXT:    retq
   5149 ;
   5150 ; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32:
   5151 ; AVX2-FAST:       # %bb.0:
   5152 ; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
   5153 ; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
   5154 ; AVX2-FAST-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0
   5155 ; AVX2-FAST-NEXT:    vzeroupper
   5156 ; AVX2-FAST-NEXT:    retq
   5157 ;
   5158 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
   5159 ; AVX512:       # %bb.0:
   5160 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   5161 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   5162 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5163 ; AVX512-NEXT:    vzeroupper
   5164 ; AVX512-NEXT:    retq
   5165   %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   5166   %2 = trunc <4 x i64> %1 to <4 x i32>
   5167   ret <4 x i32> %2
   5168 }
   5169 
   5170 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
   5171 ; SSE-LABEL: trunc_or_const_v8i64_v8i16:
   5172 ; SSE:       # %bb.0:
   5173 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
   5174 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
   5175 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   5176 ; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
   5177 ; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   5178 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
   5179 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
   5180 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
   5181 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
   5182 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   5183 ; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
   5184 ; SSE-NEXT:    orpd {{.*}}(%rip), %xmm0
   5185 ; SSE-NEXT:    retq
   5186 ;
   5187 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
   5188 ; AVX1:       # %bb.0:
   5189 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   5190 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
   5191 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   5192 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
   5193 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   5194 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   5195 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
   5196 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
   5197 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   5198 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   5199 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5200 ; AVX1-NEXT:    vzeroupper
   5201 ; AVX1-NEXT:    retq
   5202 ;
   5203 ; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
   5204 ; AVX2-SLOW:       # %bb.0:
   5205 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   5206 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5207 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   5208 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   5209 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   5210 ; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   5211 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5212 ; AVX2-SLOW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5213 ; AVX2-SLOW-NEXT:    vzeroupper
   5214 ; AVX2-SLOW-NEXT:    retq
   5215 ;
   5216 ; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16:
   5217 ; AVX2-FAST:       # %bb.0:
   5218 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
   5219 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm0
   5220 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
   5221 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   5222 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   5223 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5224 ; AVX2-FAST-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5225 ; AVX2-FAST-NEXT:    vzeroupper
   5226 ; AVX2-FAST-NEXT:    retq
   5227 ;
   5228 ; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
   5229 ; AVX512:       # %bb.0:
   5230 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
   5231 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5232 ; AVX512-NEXT:    vzeroupper
   5233 ; AVX512-NEXT:    retq
   5234   %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   5235   %2 = trunc <8 x i64> %1 to <8 x i16>
   5236   ret <8 x i16> %2
   5237 }
   5238 
   5239 define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
   5240 ; SSE-LABEL: trunc_or_const_v8i32_v8i16:
   5241 ; SSE:       # %bb.0:
   5242 ; SSE-NEXT:    pslld $16, %xmm1
   5243 ; SSE-NEXT:    psrad $16, %xmm1
   5244 ; SSE-NEXT:    pslld $16, %xmm0
   5245 ; SSE-NEXT:    psrad $16, %xmm0
   5246 ; SSE-NEXT:    packssdw %xmm1, %xmm0
   5247 ; SSE-NEXT:    por {{.*}}(%rip), %xmm0
   5248 ; SSE-NEXT:    retq
   5249 ;
   5250 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
   5251 ; AVX1:       # %bb.0:
   5252 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   5253 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
   5254 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   5255 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   5256 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   5257 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5258 ; AVX1-NEXT:    vzeroupper
   5259 ; AVX1-NEXT:    retq
   5260 ;
   5261 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
   5262 ; AVX2:       # %bb.0:
   5263 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   5264 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5265 ; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5266 ; AVX2-NEXT:    vzeroupper
   5267 ; AVX2-NEXT:    retq
   5268 ;
   5269 ; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
   5270 ; AVX512:       # %bb.0:
   5271 ; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   5272 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
   5273 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5274 ; AVX512-NEXT:    vzeroupper
   5275 ; AVX512-NEXT:    retq
   5276   %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   5277   %2 = trunc <8 x i32> %1 to <8 x i16>
   5278   ret <8 x i16> %2
   5279 }
   5280 
   5281 define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
   5282 ; SSE-LABEL: trunc_or_const_v16i64_v16i8:
   5283 ; SSE:       # %bb.0:
   5284 ; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   5285 ; SSE-NEXT:    pand %xmm8, %xmm7
   5286 ; SSE-NEXT:    pand %xmm8, %xmm6
   5287 ; SSE-NEXT:    packuswb %xmm7, %xmm6
   5288 ; SSE-NEXT:    pand %xmm8, %xmm5
   5289 ; SSE-NEXT:    pand %xmm8, %xmm4
   5290 ; SSE-NEXT:    packuswb %xmm5, %xmm4
   5291 ; SSE-NEXT:    packuswb %xmm6, %xmm4
   5292 ; SSE-NEXT:    pand %xmm8, %xmm3
   5293 ; SSE-NEXT:    pand %xmm8, %xmm2
   5294 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   5295 ; SSE-NEXT:    pand %xmm8, %xmm1
   5296 ; SSE-NEXT:    pand %xmm8, %xmm0
   5297 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   5298 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   5299 ; SSE-NEXT:    packuswb %xmm4, %xmm0
   5300 ; SSE-NEXT:    por {{.*}}(%rip), %xmm0
   5301 ; SSE-NEXT:    retq
   5302 ;
   5303 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
   5304 ; AVX1:       # %bb.0:
   5305 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
   5306 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   5307 ; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
   5308 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   5309 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
   5310 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
   5311 ; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
   5312 ; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
   5313 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
   5314 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
   5315 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
   5316 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   5317 ; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
   5318 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
   5319 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
   5320 ; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
   5321 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
   5322 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
   5323 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
   5324 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   5325 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5326 ; AVX1-NEXT:    vzeroupper
   5327 ; AVX1-NEXT:    retq
   5328 ;
   5329 ; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
   5330 ; AVX2-SLOW:       # %bb.0:
   5331 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
   5332 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   5333 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
   5334 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
   5335 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   5336 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   5337 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   5338 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   5339 ; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   5340 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   5341 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
   5342 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5343 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
   5344 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   5345 ; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   5346 ; AVX2-SLOW-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   5347 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5348 ; AVX2-SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   5349 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   5350 ; AVX2-SLOW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5351 ; AVX2-SLOW-NEXT:    vzeroupper
   5352 ; AVX2-SLOW-NEXT:    retq
   5353 ;
   5354 ; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8:
   5355 ; AVX2-FAST:       # %bb.0:
   5356 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
   5357 ; AVX2-FAST-NEXT:    vpermd %ymm2, %ymm4, %ymm2
   5358 ; AVX2-FAST-NEXT:    vpermd %ymm3, %ymm4, %ymm3
   5359 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
   5360 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   5361 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
   5362 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
   5363 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   5364 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
   5365 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm4, %ymm0
   5366 ; AVX2-FAST-NEXT:    vpermd %ymm1, %ymm4, %ymm1
   5367 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   5368 ; AVX2-FAST-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
   5369 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5370 ; AVX2-FAST-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
   5371 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   5372 ; AVX2-FAST-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5373 ; AVX2-FAST-NEXT:    vzeroupper
   5374 ; AVX2-FAST-NEXT:    retq
   5375 ;
   5376 ; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
   5377 ; AVX512:       # %bb.0:
   5378 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
   5379 ; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
   5380 ; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
   5381 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   5382 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5383 ; AVX512-NEXT:    vzeroupper
   5384 ; AVX512-NEXT:    retq
   5385   %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   5386   %2 = trunc <16 x i64> %1 to <16 x i8>
   5387   ret <16 x i8> %2
   5388 }
   5389 
   5390 define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
   5391 ; SSE-LABEL: trunc_or_const_v16i32_v16i8:
   5392 ; SSE:       # %bb.0:
   5393 ; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   5394 ; SSE-NEXT:    pand %xmm4, %xmm3
   5395 ; SSE-NEXT:    pand %xmm4, %xmm2
   5396 ; SSE-NEXT:    packuswb %xmm3, %xmm2
   5397 ; SSE-NEXT:    pand %xmm4, %xmm1
   5398 ; SSE-NEXT:    pand %xmm4, %xmm0
   5399 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   5400 ; SSE-NEXT:    packuswb %xmm2, %xmm0
   5401 ; SSE-NEXT:    por {{.*}}(%rip), %xmm0
   5402 ; SSE-NEXT:    retq
   5403 ;
   5404 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
   5405 ; AVX1:       # %bb.0:
   5406 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   5407 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
   5408 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   5409 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   5410 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
   5411 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   5412 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   5413 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   5414 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
   5415 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   5416 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5417 ; AVX1-NEXT:    vzeroupper
   5418 ; AVX1-NEXT:    retq
   5419 ;
   5420 ; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
   5421 ; AVX2:       # %bb.0:
   5422 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
   5423 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   5424 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
   5425 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   5426 ; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   5427 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   5428 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
   5429 ; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   5430 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   5431 ; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5432 ; AVX2-NEXT:    vzeroupper
   5433 ; AVX2-NEXT:    retq
   5434 ;
   5435 ; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
   5436 ; AVX512:       # %bb.0:
   5437 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
   5438 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5439 ; AVX512-NEXT:    vzeroupper
   5440 ; AVX512-NEXT:    retq
   5441   %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   5442   %2 = trunc <16 x i32> %1 to <16 x i8>
   5443   ret <16 x i8> %2
   5444 }
   5445 
   5446 define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
   5447 ; SSE-LABEL: trunc_or_const_v16i16_v16i8:
   5448 ; SSE:       # %bb.0:
   5449 ; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   5450 ; SSE-NEXT:    pand %xmm2, %xmm1
   5451 ; SSE-NEXT:    pand %xmm2, %xmm0
   5452 ; SSE-NEXT:    packuswb %xmm1, %xmm0
   5453 ; SSE-NEXT:    por {{.*}}(%rip), %xmm0
   5454 ; SSE-NEXT:    retq
   5455 ;
   5456 ; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
   5457 ; AVX1:       # %bb.0:
   5458 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   5459 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   5460 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   5461 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   5462 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   5463 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5464 ; AVX1-NEXT:    vzeroupper
   5465 ; AVX1-NEXT:    retq
   5466 ;
   5467 ; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
   5468 ; AVX2:       # %bb.0:
   5469 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   5470 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   5471 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   5472 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   5473 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   5474 ; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5475 ; AVX2-NEXT:    vzeroupper
   5476 ; AVX2-NEXT:    retq
   5477 ;
   5478 ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
   5479 ; AVX512F:       # %bb.0:
   5480 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
   5481 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
   5482 ; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5483 ; AVX512F-NEXT:    vzeroupper
   5484 ; AVX512F-NEXT:    retq
   5485 ;
   5486 ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
   5487 ; AVX512BW:       # %bb.0:
   5488 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
   5489 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   5490 ; AVX512BW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5491 ; AVX512BW-NEXT:    vzeroupper
   5492 ; AVX512BW-NEXT:    retq
   5493 ;
   5494 ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
   5495 ; AVX512DQ:       # %bb.0:
   5496 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   5497 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   5498 ; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
   5499 ; AVX512DQ-NEXT:    vzeroupper
   5500 ; AVX512DQ-NEXT:    retq
   5501   %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   5502   %2 = trunc <16 x i16> %1 to <16 x i8>
   5503   ret <16 x i8> %2
   5504 }
   5505 
   5506 ;
   5507 ; complex patterns - often created by vectorizer
   5508 ;
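; NOTE: The scalar sketch below is illustrative only and is not part of the
; autogenerated checks; the function name is hypothetical. Assuming a typical
; widening loop vectorizer, this is the scalar sext/mul/add/trunc chain that
; would be widened into the <4 x i64> patterns exercised by the tests below.
;
; define i32 @scalar_mul_add_const(i32 %a, i32 %b) {
;   %sa = sext i32 %a to i64
;   %sb = sext i32 %b to i64
;   %m  = mul i64 %sa, %sb
;   %r  = add i64 %m, 3
;   %t  = trunc i64 %r to i32
;   ret i32 %t
; }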
   5509 
   5510 define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
   5511 ; SSE-LABEL: mul_add_const_v4i64_v4i32:
   5512 ; SSE:       # %bb.0:
   5513 ; SSE-NEXT:    movdqa %xmm0, %xmm2
   5514 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
   5515 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
   5516 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
   5517 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
   5518 ; SSE-NEXT:    movdqa %xmm2, %xmm4
   5519 ; SSE-NEXT:    psrlq $32, %xmm4
   5520 ; SSE-NEXT:    pmuludq %xmm1, %xmm4
   5521 ; SSE-NEXT:    movdqa %xmm1, %xmm5
   5522 ; SSE-NEXT:    psrlq $32, %xmm5
   5523 ; SSE-NEXT:    pmuludq %xmm2, %xmm5
   5524 ; SSE-NEXT:    paddq %xmm4, %xmm5
   5525 ; SSE-NEXT:    psllq $32, %xmm5
   5526 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
   5527 ; SSE-NEXT:    paddq %xmm5, %xmm2
   5528 ; SSE-NEXT:    movdqa %xmm0, %xmm1
   5529 ; SSE-NEXT:    psrlq $32, %xmm1
   5530 ; SSE-NEXT:    pmuludq %xmm3, %xmm1
   5531 ; SSE-NEXT:    movdqa %xmm3, %xmm4
   5532 ; SSE-NEXT:    psrlq $32, %xmm4
   5533 ; SSE-NEXT:    pmuludq %xmm0, %xmm4
   5534 ; SSE-NEXT:    paddq %xmm1, %xmm4
   5535 ; SSE-NEXT:    psllq $32, %xmm4
   5536 ; SSE-NEXT:    pmuludq %xmm3, %xmm0
   5537 ; SSE-NEXT:    paddq %xmm4, %xmm0
   5538 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
   5539 ; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
   5540 ; SSE-NEXT:    retq
   5541 ;
   5542 ; AVX-LABEL: mul_add_const_v4i64_v4i32:
   5543 ; AVX:       # %bb.0:
   5544 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   5545 ; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
   5546 ; AVX-NEXT:    retq
   5547   %1 = sext <4 x i32> %a0 to <4 x i64>
   5548   %2 = sext <4 x i32> %a1 to <4 x i64>
   5549   %3 = mul <4 x i64> %1, %2
   5550   %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
   5551   %5 = trunc <4 x i64> %4 to <4 x i32>
   5552   ret <4 x i32> %5
   5553 }
   5554 
   5555 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
   5556 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
   5557 ; SSE:       # %bb.0:
   5558 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   5559 ; SSE-NEXT:    movdqa %xmm2, %xmm3
   5560 ; SSE-NEXT:    psrad $31, %xmm3
   5561 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   5562 ; SSE-NEXT:    movdqa %xmm0, %xmm3
   5563 ; SSE-NEXT:    psrad $31, %xmm3
   5564 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
   5565 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
   5566 ; SSE-NEXT:    movdqa %xmm3, %xmm4
   5567 ; SSE-NEXT:    psrad $31, %xmm4
   5568 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
   5569 ; SSE-NEXT:    movdqa %xmm1, %xmm4
   5570 ; SSE-NEXT:    psrad $31, %xmm4
   5571 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
   5572 ; SSE-NEXT:    movdqa %xmm0, %xmm4
   5573 ; SSE-NEXT:    psrlq $32, %xmm4
   5574 ; SSE-NEXT:    pmuludq %xmm1, %xmm4
   5575 ; SSE-NEXT:    movdqa %xmm1, %xmm5
   5576 ; SSE-NEXT:    psrlq $32, %xmm5
   5577 ; SSE-NEXT:    pmuludq %xmm0, %xmm5
   5578 ; SSE-NEXT:    paddq %xmm4, %xmm5
   5579 ; SSE-NEXT:    psllq $32, %xmm5
   5580 ; SSE-NEXT:    pmuludq %xmm0, %xmm1
   5581 ; SSE-NEXT:    paddq %xmm5, %xmm1
   5582 ; SSE-NEXT:    movdqa %xmm2, %xmm0
   5583 ; SSE-NEXT:    psrlq $32, %xmm0
   5584 ; SSE-NEXT:    pmuludq %xmm3, %xmm0
   5585 ; SSE-NEXT:    movdqa %xmm3, %xmm4
   5586 ; SSE-NEXT:    psrlq $32, %xmm4
   5587 ; SSE-NEXT:    pmuludq %xmm2, %xmm4
   5588 ; SSE-NEXT:    paddq %xmm0, %xmm4
   5589 ; SSE-NEXT:    psllq $32, %xmm4
   5590 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
   5591 ; SSE-NEXT:    paddq %xmm4, %xmm3
   5592 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
   5593 ; SSE-NEXT:    paddd %xmm1, %xmm1
   5594 ; SSE-NEXT:    movdqa %xmm1, %xmm0
   5595 ; SSE-NEXT:    retq
   5596 ;
   5597 ; AVX-LABEL: mul_add_self_v4i64_v4i32:
   5598 ; AVX:       # %bb.0:
   5599 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   5600 ; AVX-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
   5601 ; AVX-NEXT:    retq
   5602   %1 = sext <4 x i32> %a0 to <4 x i64>
   5603   %2 = sext <4 x i32> %a1 to <4 x i64>
   5604   %3 = mul <4 x i64> %1, %2
   5605   %4 = add <4 x i64> %3, %3
   5606   %5 = trunc <4 x i64> %4 to <4 x i32>
   5607   ret <4 x i32> %5
   5608 }
   5609 
   5610 define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
   5611 ; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
   5612 ; SSE:       # %bb.0:
   5613 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
   5614 ; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
   5615 ; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
   5616 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
   5617 ; SSE-NEXT:    movdqa %xmm3, %xmm5
   5618 ; SSE-NEXT:    psrlq $32, %xmm5
   5619 ; SSE-NEXT:    pmuludq %xmm1, %xmm5
   5620 ; SSE-NEXT:    movdqa %xmm1, %xmm6
   5621 ; SSE-NEXT:    psrlq $32, %xmm6
   5622 ; SSE-NEXT:    pmuludq %xmm3, %xmm6
   5623 ; SSE-NEXT:    paddq %xmm5, %xmm6
   5624 ; SSE-NEXT:    psllq $32, %xmm6
   5625 ; SSE-NEXT:    pmuludq %xmm1, %xmm3
   5626 ; SSE-NEXT:    paddq %xmm6, %xmm3
   5627 ; SSE-NEXT:    movdqa %xmm2, %xmm1
   5628 ; SSE-NEXT:    psrlq $32, %xmm1
   5629 ; SSE-NEXT:    pmuludq %xmm4, %xmm1
   5630 ; SSE-NEXT:    movdqa %xmm4, %xmm5
   5631 ; SSE-NEXT:    psrlq $32, %xmm5
   5632 ; SSE-NEXT:    pmuludq %xmm2, %xmm5
   5633 ; SSE-NEXT:    paddq %xmm1, %xmm5
   5634 ; SSE-NEXT:    psllq $32, %xmm5
   5635 ; SSE-NEXT:    pmuludq %xmm4, %xmm2
   5636 ; SSE-NEXT:    paddq %xmm5, %xmm2
   5637 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
   5638 ; SSE-NEXT:    paddd %xmm2, %xmm0
   5639 ; SSE-NEXT:    retq
   5640 ;
   5641 ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
   5642 ; AVX:       # %bb.0:
   5643 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
   5644 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
   5645 ; AVX-NEXT:    retq
   5646   %1 = sext <4 x i32> %a0 to <4 x i64>
   5647   %2 = sext <4 x i32> %a1 to <4 x i64>
   5648   %3 = mul <4 x i64> %1, %2
   5649   %4 = add <4 x i64> %1, %3
   5650   %5 = trunc <4 x i64> %4 to <4 x i32>
   5651   ret <4 x i32> %5
   5652 }