Home | History | Annotate | Download | only in X86
      1 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
      2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ
      3 
      4 target triple = "x86_64-unknown-unknown" ; x86-64 architecture; the vendor and OS fields are both "unknown"
      5 
      6 define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
        ; Swaps the two lanes of the <2 x i1> argument (shuffle mask <1, 0>).
        ; The second shuffle operand is undef and is never selected by the mask.
        ; Under the VL_BW_DQ prefix the expected code moves the vector through a
        ; mask register (vpmovq2m / vpmovm2q) both before and after the vpshufd.
      7 ; AVX512F-LABEL: shuf2i1_1_0:
      8 ; AVX512F:       # BB#0:
      9 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
     10 ; AVX512F-NEXT:    retq
     11 ;
     12 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
     13 ; VL_BW_DQ:       # BB#0:
     14 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
     15 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     16 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
     17 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
     18 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     19 ; VL_BW_DQ-NEXT:    retq
     20   %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
     21   ret <2 x i1> %b
     22 }
     23 
     24 define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
        ; Two-source <2 x i1> shuffle with mask <1, 2>: result lane 0 is lane 1
        ; of %a, and result lane 1 is element 0 of the constant <i1 1, i1 0>,
        ; i.e. true. Both expected sequences materialize the constant 1 in a
        ; general-purpose register and combine the two sources with vpalignr.
     25 ; AVX512F-LABEL: shuf2i1_1_2:
     26 ; AVX512F:       # BB#0:
     27 ; AVX512F-NEXT:    movl $1, %eax
     28 ; AVX512F-NEXT:    vmovq %rax, %xmm1
     29 ; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
     30 ; AVX512F-NEXT:    retq
     31 ;
     32 ; VL_BW_DQ-LABEL: shuf2i1_1_2:
     33 ; VL_BW_DQ:       # BB#0:
     34 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
     35 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     36 ; VL_BW_DQ-NEXT:    movb $1, %al
     37 ; VL_BW_DQ-NEXT:    kmovb %eax, %k0
     38 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
     39 ; VL_BW_DQ-NEXT:    vpalignr $8, %xmm0, %xmm1, %xmm0
     40 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
     41 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     42 ; VL_BW_DQ-NEXT:    retq
     43   %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
     44   ret <2 x i1> %b
     45 }
     46 
     47 
     47 define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
        ; Reverses the four lanes of the <4 x i1> argument (mask <3, 2, 1, 0>).
        ; The second shuffle operand is undef and is never selected.
        ; Note that the function name spells the last two mask indices as "10"
        ; without a separator; the mask itself is 3, 2, 1, 0.
     48 ; AVX512F-LABEL: shuf4i1_3_2_10:
     49 ; AVX512F:       # BB#0:
     50 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
     51 ; AVX512F-NEXT:    retq
     52 ;
     53 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
     54 ; VL_BW_DQ:       # BB#0:
     55 ; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
     56 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
     57 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
     58 ; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
     59 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
     60 ; VL_BW_DQ-NEXT:    retq
     61   %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
     62   ret <4 x i1> %b
     63 }
     65 
     66 define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
        ; Shuffles the <8 x i1> result of comparing %a with %a1 using mask
        ; <3, 6, 1, 0, 3, 7, 7, 0>. A second compare (%b vs. %b1) is passed as
        ; the other shuffle operand, but every mask index is below 8, so no
        ; result lane is taken from it; accordingly the expected code contains
        ; only one vpcmpeqq.
     67 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
     68 ; AVX512F:       # BB#0:
     69 ; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
     70 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0
     71 ; AVX512F-NEXT:    vmovdqu64 %zmm0, %zmm1 {%k1} {z}
     72 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
     73 ; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
     74 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
     75 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
     76 ; AVX512F-NEXT:    vmovdqu64 %zmm0, %zmm0 {%k1} {z}
     77 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
     78 ; AVX512F-NEXT:    retq
     79 ;
     80 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
     81 ; VL_BW_DQ:       # BB#0:
     82 ; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
     83 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
     84 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
     85 ; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
     86 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
     87 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
     88 ; VL_BW_DQ-NEXT:    retq
     89   %a2 = icmp eq <8 x i64> %a, %a1
     90   %b2 = icmp eq <8 x i64> %b, %b1
     91   %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
     92   ret <8 x i1> %c
     93 }
     94 
     95 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
        ; Two-source <16 x i1> shuffle of two compare results (%a vs. %a1 and
        ; %b vs. %b1). Mask indices 22 and 21 are >= 16 and therefore select
        ; lanes 6 and 5 of the second compare; all other indices select from
        ; the first. Both expected sequences use the two-table permute vpermt2d.
     96 ; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
     97 ; AVX512F:       # BB#0:
     98 ; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
     99 ; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
    100 ; AVX512F-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0
    101 ; AVX512F-NEXT:    vmovdqu32 %zmm0, %zmm1 {%k2} {z}
    102 ; AVX512F-NEXT:    vmovdqu32 %zmm0, %zmm2 {%k1} {z}
    103 ; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    104 ; AVX512F-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
    105 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm1
    106 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
    107 ; AVX512F-NEXT:    vmovdqu32 %zmm0, %zmm0 {%k1} {z}
    108 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
    109 ; AVX512F-NEXT:    retq
    110 ;
    111 ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    112 ; VL_BW_DQ:       # BB#0:
    113 ; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
    114 ; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
    115 ; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
    116 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
    117 ; VL_BW_DQ-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    118 ; VL_BW_DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
    119 ; VL_BW_DQ-NEXT:    vpmovd2m %zmm1, %k0
    120 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
    121 ; VL_BW_DQ-NEXT:    retq
    122   %a2 = icmp eq <16 x i32> %a, %a1
    123   %b2 = icmp eq <16 x i32> %b, %b1
    124   %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    125   ret <16 x i1> %c
    126 }
    127 
    128 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
        ; Single-source <32 x i1> shuffle: the 16-index pattern
        ; 3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0 is repeated twice. With 32 input
        ; lanes, indices 22 and 21 still refer to %a (the other operand is undef).
        ; The AVX512F expectation works on ymm bytes (vperm2i128, two vpshufb and
        ; a vpblendvb); the VL_BW_DQ expectation widens the mask to words in a
        ; zmm register and uses a single vpermw.
    129 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    130 ; AVX512F:       # BB#0:
    131 ; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
    132 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
    133 ; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
    134 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
    135 ; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
    136 ; AVX512F-NEXT:    retq
    137 ;
    138 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    139 ; VL_BW_DQ:       # BB#0:
    140 ; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
    141 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
    142 ; VL_BW_DQ-NEXT:    vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    143 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
    144 ; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
    145 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
    146 ; VL_BW_DQ-NEXT:    retq
    147   %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    148   ret <32 x i1> %b
    149 }
    150 
    151 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
        ; Reinterprets the i8 argument as <8 x i1> and shuffles it with a mask
        ; whose only defined indices are 2 (at result lanes 1, 4 and 6); every
        ; other mask element is undef. In both expected sequences the permute
        ; index vector is loaded with a vpbroadcastq rather than a full-width
        ; constant load.
    152 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
    153 ; AVX512F:       # BB#0:
    154 ; AVX512F-NEXT:    movzbl %dil, %eax
    155 ; AVX512F-NEXT:    kmovw %eax, %k1
    156 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0
    157 ; AVX512F-NEXT:    vmovdqu64 %zmm0, %zmm1 {%k1} {z}
    158 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm2
    159 ; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
    160 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
    161 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
    162 ; AVX512F-NEXT:    vmovdqu64 %zmm0, %zmm0 {%k1} {z}
    163 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
    164 ; AVX512F-NEXT:    retq
    165 ;
    166 ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
    167 ; VL_BW_DQ:       # BB#0:
    168 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    169 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    170 ; VL_BW_DQ-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1
    171 ; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
    172 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
    173 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
    174 ; VL_BW_DQ-NEXT:    retq
    175   %b = bitcast i8 %a to <8 x i1>
    176   %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
    177   ret <8 x i1> %c
    178 }
    179 
    180 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
        ; Reinterprets the i8 argument as <8 x i1>, shuffles it against an
        ; all-zero second operand, and returns the result as an i8 bitmask.
        ; Mask indices 10 and 9 are >= 8, so result lanes 0 and 2 come from the
        ; zero vector; lanes 3, 5 and 7 are undef.
        ; The expected permute constant shows 8 and 10 at those two positions
        ; instead of 10 and 9; both are still indices >= 8, i.e. they still
        ; refer to the all-zero operand.
    181 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
    182 ; AVX512F:       # BB#0:
    183 ; AVX512F-NEXT:    movzbl %dil, %eax
    184 ; AVX512F-NEXT:    kmovw %eax, %k1
    185 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
    186 ; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
    187 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
    188 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    189 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    190 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    191 ; AVX512F-NEXT:    kmovw %k0, %eax
    192 ; AVX512F-NEXT:    retq
    193 ;
    194 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
    195 ; VL_BW_DQ:       # BB#0:
    196 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    197 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    198 ; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
    199 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
    200 ; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    201 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
    202 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    203 ; VL_BW_DQ-NEXT:    retq
    204   %b = bitcast i8 %a to <8 x i1>
    205   %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
    206   %d = bitcast <8 x i1> %c to i8
    207   ret i8 %d
    208 }
    209 
    210 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
        ; Reinterprets the i8 argument as <8 x i1>, places input lanes 0, 1, 4
        ; and 5 in result lanes 0-3 (result lanes 4-7 are undef), and returns
        ; the result as an i8 bitmask. Both expected sequences implement the
        ; shuffle with vshufi64x2 $8, whose decoded lane comment is
        ; zmm0[0,1,4,5,0,1,0,1].
    211 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
    212 ; AVX512F:       # BB#0:
    213 ; AVX512F-NEXT:    movzbl %dil, %eax
    214 ; AVX512F-NEXT:    kmovw %eax, %k1
    215 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
    216 ; AVX512F-NEXT:    vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
    217 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    218 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    219 ; AVX512F-NEXT:    kmovw %k0, %eax
    220 ; AVX512F-NEXT:    retq
    221 ;
    222 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
    223 ; VL_BW_DQ:       # BB#0:
    224 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    225 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    226 ; VL_BW_DQ-NEXT:    vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
    227 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
    228 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    229 ; VL_BW_DQ-NEXT:    retq
    230   %b = bitcast i8 %a to <8 x i1>
    231   %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
    232   %d = bitcast <8 x i1> %c to i8
    233   ret i8 %d
    234 }
    235 
    236 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
        ; Reinterprets the i8 argument as <8 x i1>, shuffles it against an
        ; all-zero second operand with mask <9, 6, 1, 0, 3, 7, 7, 0>, and returns
        ; the result as an i8 bitmask. Index 9 is >= 8, so result lane 0 comes
        ; from the zero vector; the remaining lanes come from the argument.
        ; The expected permute constant has 8 rather than 9 in lane 0, which is
        ; still an index into the all-zero operand.
    237 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
    238 ; AVX512F:       # BB#0:
    239 ; AVX512F-NEXT:    movzbl %dil, %eax
    240 ; AVX512F-NEXT:    kmovw %eax, %k1
    241 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
    242 ; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
    243 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
    244 ; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    245 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
    246 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    247 ; AVX512F-NEXT:    kmovw %k0, %eax
    248 ; AVX512F-NEXT:    retq
    249 ;
    250 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
    251 ; VL_BW_DQ:       # BB#0:
    252 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    253 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    254 ; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
    255 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
    256 ; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    257 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
    258 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    259 ; VL_BW_DQ-NEXT:    retq
    260   %b = bitcast i8 %a to <8 x i1>
    261   %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
    262   %d = bitcast <8 x i1>%c to i8
    263   ret i8 %d
    264 }
    265 
    266 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
        ; Same idea as the zero-second-operand tests, but with the operands
        ; swapped: the FIRST shuffle operand is all-zero and the second is the
        ; i8 argument reinterpreted as <8 x i1>. Only mask indices 9 and 10
        ; (result lanes 0 and 3) are >= 8 and so select argument lanes 1 and 2;
        ; every other result lane comes from the zero vector.
        ; The expected permute constant is [9,1,2,10,4,5,6,7], applied with the
        ; zeroed register as the vpermt2q destination.
    267 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
    268 ; AVX512F:       # BB#0:
    269 ; AVX512F-NEXT:    movzbl %dil, %eax
    270 ; AVX512F-NEXT:    kmovw %eax, %k1
    271 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
    272 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
    273 ; AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
    274 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    275 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm0
    276 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    277 ; AVX512F-NEXT:    kmovw %k0, %eax
    278 ; AVX512F-NEXT:    retq
    279 ;
    280 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
    281 ; VL_BW_DQ:       # BB#0:
    282 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    283 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    284 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
    285 ; VL_BW_DQ-NEXT:    vpxord %zmm2, %zmm2, %zmm2
    286 ; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    287 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
    288 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    289 ; VL_BW_DQ-NEXT:    retq
    290   %b = bitcast i8 %a to <8 x i1>
    291   %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
    292   %d = bitcast <8 x i1>%c to i8
    293   ret i8 %d
    294 }
    295 
    296 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
        ; Shuffles a constant <8 x i1> first operand <1,1,0,0,1,1,0,0> with the
        ; i8 argument (reinterpreted as <8 x i1>) as second operand, and returns
        ; the result as an i8 bitmask. Only mask index 9 (result lane 0) selects
        ; from the argument; all other lanes select from the constant.
        ; The immediate 51 (0b00110011) in the expected code is that constant
        ; as a bitmask: lanes 0, 1, 4 and 5 set.
        ; NOTE(review): the function name has 10 in the fourth position, but the
        ; shuffle mask (and the expected permute constant) has 0 there - confirm
        ; which was intended before relying on the name.
    297 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
    298 ; AVX512F:       # BB#0:
    299 ; AVX512F-NEXT:    movzbl %dil, %eax
    300 ; AVX512F-NEXT:    kmovw %eax, %k1
    301 ; AVX512F-NEXT:    movb $51, %al
    302 ; AVX512F-NEXT:    movzbl %al, %eax
    303 ; AVX512F-NEXT:    kmovw %eax, %k2
    304 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0
    305 ; AVX512F-NEXT:    vmovdqu64 %zmm0, %zmm1 {%k2} {z}
    306 ; AVX512F-NEXT:    vmovdqu64 %zmm0, %zmm0 {%k1} {z}
    307 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
    308 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
    309 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm0
    310 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    311 ; AVX512F-NEXT:    kmovw %k0, %eax
    312 ; AVX512F-NEXT:    retq
    313 ;
    314 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
    315 ; VL_BW_DQ:       # BB#0:
    316 ; VL_BW_DQ-NEXT:    kmovb %edi, %k0
    317 ; VL_BW_DQ-NEXT:    movb $51, %al
    318 ; VL_BW_DQ-NEXT:    kmovb %eax, %k1
    319 ; VL_BW_DQ-NEXT:    vpmovm2q %k1, %zmm0
    320 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm1
    321 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
    322 ; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
    323 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
    324 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    325 ; VL_BW_DQ-NEXT:    retq
    326   %b = bitcast i8 %a to <8 x i1>
    327   %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
    328   %c1 = bitcast <8 x i1>%c to i8
    329   ret i8 %c1
    330 }
    331 
    332 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
        ; Shuffles an all-true constant <8 x i1> first operand with the <8 x i1>
        ; argument as second operand and returns the result as an i8 bitmask.
        ; Unlike the neighbouring tests, the argument is already a vector, so
        ; there is no i8-to-vector bitcast on the input side.
        ; Only mask index 9 (result lane 0) selects from the argument (its
        ; lane 1); every other result lane comes from the all-true constant.
        ; The expected permute constant is [9,1,2,3,4,5,6,7].
        ; NOTE(review): the function name has 10 in the fourth position, but the
        ; shuffle mask has 0 there - confirm which was intended.
    333 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
    334 ; AVX512F:       # BB#0:
    335 ; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
    336 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm1
    337 ; AVX512F-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    338 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
    339 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
    340 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
    341 ; AVX512F-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm3
    342 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm3
    343 ; AVX512F-NEXT:    vpandq %zmm3, %zmm1, %zmm0
    344 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    345 ; AVX512F-NEXT:    kmovw %k0, %eax
    346 ; AVX512F-NEXT:    retq
    347 ;
    348 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
    349 ; VL_BW_DQ:       # BB#0:
    350 ; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
    351 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
    352 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
    353 ; VL_BW_DQ-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm2
    354 ; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    355 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
    356 ; VL_BW_DQ-NEXT:    kmovb %k0, %eax
    357 ; VL_BW_DQ-NEXT:    retq
    358   %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
    359   %c1 = bitcast <8 x i1>%c to i8
    360   ret i8 %c1
    361 }
    362 
    363 
    364 define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
        ; Splat test: reinterprets the i16 argument as <16 x i1>, replicates
        ; lane 0 into all 16 result lanes (the shuffle mask is zeroinitializer,
        ; i.e. sixteen zeros), and returns the result as an i16 bitmask.
        ; Both expected sequences implement the splat with a register-to-register
        ; vpbroadcastd.
    365 ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
    366 ; AVX512F:       # BB#0:
    367 ; AVX512F-NEXT:    kmovw %edi, %k1
    368 ; AVX512F-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
    369 ; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
    370 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
    371 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
    372 ; AVX512F-NEXT:    kmovw %k0, %eax
    373 ; AVX512F-NEXT:    retq
    374 ;
    375 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
    376 ; VL_BW_DQ:       # BB#0:
    377 ; VL_BW_DQ-NEXT:    kmovw %edi, %k0
    378 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
    379 ; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
    380 ; VL_BW_DQ-NEXT:    vpmovd2m %zmm0, %k0
    381 ; VL_BW_DQ-NEXT:    kmovw %k0, %eax
    382 ; VL_BW_DQ-NEXT:    retq
    383   %b = bitcast i16 %a to <16 x i1>
    384   %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
    385   %d = bitcast <16 x i1> %c to i16
    386   ret i16 %d
    387 }
    388 
    389 define i64 @shuf64i1_zero(i64 %a) {
        ; Splats lane 0 of an all-zero <64 x i1> constant and returns the result
        ; as an i64 bitmask; the expected code simply zeroes a mask register
        ; (kxorq) and moves it to rax.
        ; Only the VL_BW_DQ prefix has check lines for this function; nothing is
        ; verified for the AVX512F-only configuration.
        ; NOTE(review): %b (the argument reinterpreted as <64 x i1>) is computed
        ; but never used - the shuffle's first operand is zeroinitializer, so the
        ; result does not depend on %a. If %b was meant to be the shuffle input,
        ; the check lines would have to be regenerated - confirm intent.
    390 ; VL_BW_DQ-LABEL: shuf64i1_zero:
    391 ; VL_BW_DQ:       # BB#0:
    392 ; VL_BW_DQ-NEXT:    kxorq %k0, %k0, %k0
    393 ; VL_BW_DQ-NEXT:    kmovq %k0, %rax
    394 ; VL_BW_DQ-NEXT:    retq
    395   %b = bitcast i64 %a to <64 x i1>
    396   %c = shufflevector < 64 x i1> zeroinitializer, <64 x i1> undef, <64 x i32> zeroinitializer
    397   %d = bitcast <64 x i1> %c to i64
    398   ret i64 %d
    399 }
    400