; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
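;
; Illustrative sketch (a comment-only example, not covered by the autogenerated
; checks): on a little-endian target such as x86-64, keeping the even bytes of a
; <16 x i8> value
;   %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; selects exactly the low byte of every i16 lane, i.e. the same result as
;   %bc = bitcast <16 x i8> %vec to <8 x i16>
;   %trunc = trunc <8 x i16> %bc to <8 x i8>
; so each shuffle_* function below is paired with an equivalent trunc_* function.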

define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v8i16_to_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v8i16_to_v8i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v8i16_to_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v8i16_to_v8i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <8 x i16>
  %strided.vec = trunc <8 x i16> %bc to <8 x i8>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_to_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %bc = bitcast <8 x i16> %vec to <4 x i32>
  %strided.vec = trunc <4 x i32> %bc to <4 x i16>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v4i32_to_v2i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i32_to_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v4i32_to_v2i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_v2i64_to_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <4 x i32>, <4 x i32>* %L
  %bc = bitcast <4 x i32> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i32>
  store <2 x i32> %strided.vec, <2 x i32>* %S
  ret void
}

define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v4i32_to_v4i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <4 x i32>
  %strided.vec = trunc <4 x i32> %bc to <4 x i8>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_v2i64_to_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, <8 x i16>* %L
  %bc = bitcast <8 x i16> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i16>
  store <2 x i16> %strided.vec, <2 x i16>* %S
  ret void
}

define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}

define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v2i64_to_v2i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: trunc_v2i64_to_v2i8:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: trunc_v2i64_to_v2i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i8>, <16 x i8>* %L
  %bc = bitcast <16 x i8> %vec to <2 x i64>
  %strided.vec = trunc <2 x i64> %bc to <2 x i8>
  store <2 x i8> %strided.vec, <2 x i8>* %S
  ret void
}