Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
      3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ
      4 
      5 target triple = "x86_64-unknown-unknown"
      6 
      7 define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
        ; Swaps the two lanes of a <2 x i1> vector (shuffle mask <1, 0>).
        ; The plain AVX512F run expects a single vpshufd. The VL/BW/DQ run expects
        ; the value to be moved into a mask register and back (vpsllq + vptestmq +
        ; vpmovm2q) both before and after the vpshufd.
      8 ; AVX512F-LABEL: shuf2i1_1_0:
      9 ; AVX512F:       # BB#0:
     10 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
     11 ; AVX512F-NEXT:    retq
     12 ;
     13 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
     14 ; VL_BW_DQ:       # BB#0:
     15 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
     16 ; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
     17 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     18 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
     19 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
     20 ; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
     21 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     22 ; VL_BW_DQ-NEXT:    retq
     23   %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
     24   ret <2 x i1> %b
     25 }
     26 
     27 define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
        ; Two-source shuffle: result lane 0 takes %a[1]; result lane 1 takes mask
        ; index 2, i.e. element 0 of the constant second operand <i1 1, i1 0>.
        ; Both runs expect the merge to be a vpalignr against a register that was
        ; materialized from the immediate 1.
     28 ; AVX512F-LABEL: shuf2i1_1_2:
     29 ; AVX512F:       # BB#0:
     30 ; AVX512F-NEXT:    movl $1, %eax
     31 ; AVX512F-NEXT:    vmovq %rax, %xmm1
     32 ; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
     33 ; AVX512F-NEXT:    retq
     34 ;
     35 ; VL_BW_DQ-LABEL: shuf2i1_1_2:
     36 ; VL_BW_DQ:       # BB#0:
     37 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
     38 ; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
     39 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     40 ; VL_BW_DQ-NEXT:    movb $1, %al
     41 ; VL_BW_DQ-NEXT:    kmovb %eax, %k0
     42 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
     43 ; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
     44 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
     45 ; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
     46 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     47 ; VL_BW_DQ-NEXT:    retq
     48   %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
     49   ret <2 x i1> %b
     50 }
     51 
     52 
     53 define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
        ; Reverses the four lanes of a <4 x i1> vector (shuffle mask <3, 2, 1, 0>).
        ; The trailing "10" in the name appears to stand for the last two mask
        ; entries, 1 and 0. Expected lowering mirrors the 2-lane swap test: one
        ; vpshufd, wrapped in mask-register round trips for the VL/BW/DQ run.
     54 ; AVX512F-LABEL: shuf4i1_3_2_10:
     55 ; AVX512F:       # BB#0:
     56 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
     57 ; AVX512F-NEXT:    retq
     58 ;
     59 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
     60 ; VL_BW_DQ:       # BB#0:
     61 ; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
     62 ; VL_BW_DQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
     63 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
     64 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
     65 ; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
     66 ; VL_BW_DQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
     67 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
     68 ; VL_BW_DQ-NEXT:    retq
     69   %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
     70   ret <4 x i1> %b
     71 }
     72 
     73 define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
        ; Shuffles the <8 x i1> result of a vector compare.
        ; Every mask index is below 8, so only lanes of %a2 (%a == %a1) are
        ; selected. %b2 is still computed in the IR, but none of its lanes reach the
        ; result, and the expected output for both runs contains a single vpcmpeqq.
     74 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
     75 ; AVX512F:       # BB#0:
     76 ; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
     77 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
     78 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
     79 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
     80 ; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
     81 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
     82 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
     83 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
     84 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
     85 ; AVX512F-NEXT:    retq
     86 ;
     87 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
     88 ; VL_BW_DQ:       # BB#0:
     89 ; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
     90 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
     91 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
     92 ; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
     93 ; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
     94 ; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
     95 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
     96 ; VL_BW_DQ-NEXT:    retq
     97   %a2 = icmp eq <8 x i64> %a, %a1
     98   %b2 = icmp eq <8 x i64> %b, %b1
     99   %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
    100   ret <8 x i1> %c
    101 }
    102 
    103 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
        ; Two-source shuffle of two <16 x i1> compare results.
        ; Mask indices 22 and 21 select lanes 6 and 5 of %b2 (%b == %b1); every
        ; other index selects a lane of %a2 (%a == %a1). Both runs expect two
        ; vpcmpeqd instructions feeding one two-table permute (vpermt2d).
    104 ; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    105 ; AVX512F:       # BB#0:
    106 ; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
    107 ; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
    108 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    109 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2} {z}
    110 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z}
    111 ; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    112 ; AVX512F-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
    113 ; AVX512F-NEXT:    vpslld $31, %zmm2, %zmm1
    114 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
    115 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
    116 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
    117 ; AVX512F-NEXT:    retq
    118 ;
    119 ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    120 ; VL_BW_DQ:       # BB#0:
    121 ; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
    122 ; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
    123 ; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
    124 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
    125 ; VL_BW_DQ-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    126 ; VL_BW_DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
    127 ; VL_BW_DQ-NEXT:    vpslld $31, %zmm1, %zmm0
    128 ; VL_BW_DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
    129 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
    130 ; VL_BW_DQ-NEXT:    retq
    131   %a2 = icmp eq <16 x i32> %a, %a1
    132   %b2 = icmp eq <16 x i32> %b, %b1
    133   %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    134   ret <16 x i1> %c
    135 }
    136 
    137 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
        ; Single-source shuffle of a <32 x i1> argument (second operand is undef).
        ; The 32-entry mask is one 16-entry pattern written twice. With 32 lanes in
        ; the first operand, indices 22 and 21 still refer to lanes of %a.
        ; The plain AVX512F run expects a byte-level lowering (vperm2i128, two
        ; vpshufb, vpblendvb); the VL/BW/DQ run expects a single vpermw on words.
    138 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    139 ; AVX512F:       # BB#0:
    140 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
    141 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
    142 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
    143 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
    144 ; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
    145 ; AVX512F-NEXT:    retq
    146 ;
    147 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    148 ; VL_BW_DQ:       # BB#0:
    149 ; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
    150 ; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
    151 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
    152 ; VL_BW_DQ-NEXT:    vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    153 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
    154 ; VL_BW_DQ-NEXT:    vpsllw $15, %zmm0, %zmm0
    155 ; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
    156 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
    157 ; VL_BW_DQ-NEXT:    retq
    158   %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    159   ret <32 x i1> %b
    160 }
    161 
    162 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
        ; Shuffle of an i8 reinterpreted as <8 x i1> where every defined mask entry
        ; is 2 and the remaining entries are undef.
        ; Both runs expect this to be treated as a broadcast of lane 2: extract the
        ; 128-bit chunk at index 1 of the widened vector, then vpbroadcastq it.
    163 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
    164 ; AVX512F:       # BB#0:
    165 ; AVX512F-NEXT:    kmovw %edi, %k1
    166 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    167 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
    168 ; AVX512F-NEXT:    vextracti32x4 $1, %zmm1, %xmm1
    169 ; AVX512F-NEXT:    vpbroadcastq %xmm1, %zmm1
    170 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
    171 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
    172 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
    173 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
    174 ; AVX512F-NEXT:    retq
    175 ;
    176 ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
    177 ; VL_BW_DQ:       # BB#0:
    178 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    179 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    180 ; VL_BW_DQ-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
    181 ; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %zmm0
    182 ; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
    183 ; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
    184 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
    185 ; VL_BW_DQ-NEXT:    retq
    186   %b = bitcast i8 %a to <8 x i1>
    187   %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
    188   ret <8 x i1> %c
    189 }
    190 
    191 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
        ; Shuffle of an i8 mask against an all-zero second operand, returned as i8.
        ; Mask indices 10 and 9 select lanes of the zeroinitializer operand, so
        ; those result lanes are known zero. The expected permute constant
        ; <8,2,10,u,3,u,2,u> uses different indices in those two slots than the IR
        ; mask does, but they still index the zeroed table register.
    192 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
    193 ; AVX512F:       # BB#0:
    194 ; AVX512F-NEXT:    kmovw %edi, %k1
    195 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    196 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
    197 ; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
    198 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
    199 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    200 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
    201 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    202 ; AVX512F-NEXT:    kmovw %k0, %eax
    203 ; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    204 ; AVX512F-NEXT:    retq
    205 ;
    206 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
    207 ; VL_BW_DQ:       # BB#0:
    208 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    209 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    210 ; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
    211 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
    212 ; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    213 ; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
    214 ; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
    215 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    216 ; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    217 ; VL_BW_DQ-NEXT:    retq
    218   %b = bitcast i8 %a to <8 x i1>
    219   %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
    220   %d = bitcast <8 x i1> %c to i8
    221   ret i8 %d
    222 }
    223 
    224 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
        ; Single-source shuffle of an i8 mask: result lanes 0-3 take source lanes
        ; 0, 1, 4, 5 and the upper four mask entries are undef.
        ; Both runs expect one vshufi64x2 between the mask-to-vector and
        ; vector-to-mask conversions.
    225 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
    226 ; AVX512F:       # BB#0:
    227 ; AVX512F-NEXT:    kmovw %edi, %k1
    228 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    229 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
    230 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
    231 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
    232 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    233 ; AVX512F-NEXT:    kmovw %k0, %eax
    234 ; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    235 ; AVX512F-NEXT:    retq
    236 ;
    237 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
    238 ; VL_BW_DQ:       # BB#0:
    239 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    240 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    241 ; VL_BW_DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
    242 ; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
    243 ; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
    244 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    245 ; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    246 ; VL_BW_DQ-NEXT:    retq
    247   %b = bitcast i8 %a to <8 x i1>
    248   %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
    249   %d = bitcast <8 x i1> %c to i8
    250   ret i8 %d
    251 }
    252 
    253 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
        ; Shuffle of an i8 mask against an all-zero second operand, returned as i8.
        ; Only mask index 9 refers to the zeroinitializer operand, so result lane 0
        ; is known zero; all other lanes come from %b. The expected permute constant
        ; has 8 in that slot rather than 9, which still indexes the zeroed register.
    254 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
    255 ; AVX512F:       # BB#0:
    256 ; AVX512F-NEXT:    kmovw %edi, %k1
    257 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    258 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
    259 ; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
    260 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
    261 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    262 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
    263 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    264 ; AVX512F-NEXT:    kmovw %k0, %eax
    265 ; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    266 ; AVX512F-NEXT:    retq
    267 ;
    268 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
    269 ; VL_BW_DQ:       # BB#0:
    270 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    271 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    272 ; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
    273 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
    274 ; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    275 ; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
    276 ; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
    277 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    278 ; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    279 ; VL_BW_DQ-NEXT:    retq
    280   %b = bitcast i8 %a to <8 x i1>
    281   %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
    282   %d = bitcast <8 x i1>%c to i8
    283   ret i8 %d
    284 }
    285 
    286 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
        ; Same idea as the zero-operand tests, but with the operands swapped: the
        ; zero vector is the FIRST shuffle operand and %b is the second.
        ; Mask indices 9 and 10 select %b[1] and %b[2]; every other entry selects a
        ; lane of the zero vector. The expected permute constant [9,1,2,10,4,5,6,7]
        ; therefore differs from the IR mask in the zero-lane slots only.
    287 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
    288 ; AVX512F:       # BB#0:
    289 ; AVX512F-NEXT:    kmovw %edi, %k1
    290 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    291 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
    292 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
    293 ; AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
    294 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    295 ; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
    296 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    297 ; AVX512F-NEXT:    kmovw %k0, %eax
    298 ; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    299 ; AVX512F-NEXT:    retq
    300 ;
    301 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
    302 ; VL_BW_DQ:       # BB#0:
    303 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    304 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    305 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
    306 ; VL_BW_DQ-NEXT:    vpxord %zmm2, %zmm2, %zmm2
    307 ; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    308 ; VL_BW_DQ-NEXT:    vpsllq $63, %zmm2, %zmm0
    309 ; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
    310 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    311 ; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    312 ; VL_BW_DQ-NEXT:    retq
    313   %b = bitcast i8 %a to <8 x i1>
    314   %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
    315   %d = bitcast <8 x i1>%c to i8
    316   ret i8 %d
    317 }
    318 
    319 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
        ; Two-source shuffle whose first operand is the non-trivial constant
        ; <1,1,0,0,1,1,0,0> and whose second operand is the i8 argument as a mask.
        ; Only mask index 9 selects from %b (lane 1); the rest select constant lanes.
        ; Both runs expect the constant to be materialized from the immediate 51
        ; (0x33, i.e. bits 0, 1, 4 and 5 set, matching the constant's true lanes).
        ; NOTE(review): the function name has 10 in the fourth slot, but the mask
        ; below has 0 there; the expected permute constant follows the mask.
    320 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
    321 ; AVX512F:       # BB#0:
    322 ; AVX512F-NEXT:    kmovw %edi, %k1
    323 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    324 ; AVX512F-NEXT:    movb $51, %al
    325 ; AVX512F-NEXT:    kmovw %eax, %k2
    326 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2} {z}
    327 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
    328 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
    329 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
    330 ; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm0
    331 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    332 ; AVX512F-NEXT:    kmovw %k0, %eax
    333 ; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    334 ; AVX512F-NEXT:    retq
    335 ;
    336 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
    337 ; VL_BW_DQ:       # BB#0:
    338 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    339 ; VL_BW_DQ-NEXT:    movb $51, %al
    340 ; VL_BW_DQ-NEXT:    kmovb %eax, %k1
    341 ; VL_BW_DQ-NEXT:    vpmovm2q %k1, %zmm0
    342 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm1
    343 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
    344 ; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    345 ; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
    346 ; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
    347 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    348 ; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    349 ; VL_BW_DQ-NEXT:    retq
    350   %b = bitcast i8 %a to <8 x i1>
    351   %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
    352   %c1 = bitcast <8 x i1>%c to i8
    353   ret i8 %c1
    354 }
    355 
    356 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
        ; Two-source shuffle whose first operand is the all-true constant and whose
        ; second operand is the <8 x i1> argument; the result is returned as i8.
        ; Only mask index 9 selects from %a (lane 1), so every other result lane is
        ; constant true. The expected permute constant is [9,1,2,3,4,5,6,7] against
        ; an all-ones register produced by vpternlogd $255.
        ; NOTE(review): the function name has 10 in the fourth slot, but the mask
        ; below has 0 there.
    357 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
    358 ; AVX512F:       # BB#0:
    359 ; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
    360 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
    361 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
    362 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    363 ; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
    364 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
    365 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    366 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
    367 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    368 ; AVX512F-NEXT:    kmovw %k0, %eax
    369 ; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    370 ; AVX512F-NEXT:    retq
    371 ;
    372 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
    373 ; VL_BW_DQ:       # BB#0:
    374 ; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
    375 ; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
    376 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    377 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
    378 ; VL_BW_DQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    379 ; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    380 ; VL_BW_DQ-NEXT:    vpsllq $63, %zmm2, %zmm0
    381 ; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
    382 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    383 ; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
    384 ; VL_BW_DQ-NEXT:    retq
    385   %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
    386   %c1 = bitcast <8 x i1>%c to i8
    387   ret i8 %c1
    388 }
    389 
    390 
    391 define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
        ; Splat of lane 0 of a 16-bit mask: the shuffle mask is zeroinitializer, so
        ; every result lane takes source lane 0. The result is returned as i16.
        ; Both runs expect the splat to lower to a single vpbroadcastd.
    392 ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
    393 ; AVX512F:       # BB#0:
    394 ; AVX512F-NEXT:    kmovw %edi, %k1
    395 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    396 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
    397 ; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
    398 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
    399 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
    400 ; AVX512F-NEXT:    kmovw %k0, %eax
    401 ; AVX512F-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
    402 ; AVX512F-NEXT:    retq
    403 ;
    404 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
    405 ; VL_BW_DQ:       # BB#0:
    406 ; VL_BW_DQ-NEXT:    kmovw %edi, %k0
    407 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
    408 ; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
    409 ; VL_BW_DQ-NEXT:    vpslld $31, %zmm0, %zmm0
    410 ; VL_BW_DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
    411 ; VL_BW_DQ-NEXT:    kmovw %k0, %eax
    412 ; VL_BW_DQ-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
    413 ; VL_BW_DQ-NEXT:    retq
    414   %b = bitcast i16 %a to <16 x i1>
    415   %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
    416   %d = bitcast <16 x i1> %c to i16
    417   ret i16 %d
    418 }
    419 
    419 define i64 @shuf64i1_zero(i64 %a) {
        ; Splat of lane 0 of a 64-bit mask (zeroinitializer shuffle mask), returned
        ; as i64.
        ; The plain AVX512F run expects a stack-based lowering: the argument is
        ; spilled, read back with 16-bit kmovw, broadcast as bytes in a ymm
        ; register, and the 64-bit result is formed by OR-ing a 32-bit value with
        ; itself shifted left by 32. The VL/BW/DQ run expects a direct
        ; kmovq / vpbroadcastb / vpmovb2m sequence with no stack traffic.
    420 ; AVX512F-LABEL: shuf64i1_zero:
    421 ; AVX512F:       # BB#0:
    422 ; AVX512F-NEXT:    pushq %rbp
    423 ; AVX512F-NEXT:  .Ltmp0:
    424 ; AVX512F-NEXT:    .cfi_def_cfa_offset 16
    425 ; AVX512F-NEXT:  .Ltmp1:
    426 ; AVX512F-NEXT:    .cfi_offset %rbp, -16
    427 ; AVX512F-NEXT:    movq %rsp, %rbp
    428 ; AVX512F-NEXT:  .Ltmp2:
    429 ; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
    430 ; AVX512F-NEXT:    andq $-32, %rsp
    431 ; AVX512F-NEXT:    subq $96, %rsp
    432 ; AVX512F-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
    433 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
    434 ; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
    435 ; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
    436 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
    437 ; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
    438 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
    439 ; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
    440 ; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
    441 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
    442 ; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
    443 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
    444 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
    445 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
    446 ; AVX512F-NEXT:    kmovw %k0, (%rsp)
    447 ; AVX512F-NEXT:    movl (%rsp), %ecx
    448 ; AVX512F-NEXT:    movq %rcx, %rax
    449 ; AVX512F-NEXT:    shlq $32, %rax
    450 ; AVX512F-NEXT:    orq %rcx, %rax
    451 ; AVX512F-NEXT:    movq %rbp, %rsp
    452 ; AVX512F-NEXT:    popq %rbp
    453 ; AVX512F-NEXT:    retq
    454 ;
    455 ; VL_BW_DQ-LABEL: shuf64i1_zero:
    456 ; VL_BW_DQ:       # BB#0:
    457 ; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
    458 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
    459 ; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
    460 ; VL_BW_DQ-NEXT:    vpsllw $7, %zmm0, %zmm0
    461 ; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
    462 ; VL_BW_DQ-NEXT:    kmovq %k0, %rax
    463 ; VL_BW_DQ-NEXT:    retq
    464   %b = bitcast i64 %a to <64 x i1>
    465   %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
    466   %d = bitcast <64 x i1> %c to i64
    467   ret i64 %d
    468 }
    470