Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ
      5 
      6 define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
        ; Swaps the two lanes of the <2 x i1> argument (shuffle mask <1, 0>).
        ; The second shuffle operand is undef and is never selected by the mask.
        ; Under all three check prefixes the expected code performs the swap
        ; with vpshufd [2,3,0,1] on a vector copy of the mask.
      7 ; AVX512F-LABEL: shuf2i1_1_0:
      8 ; AVX512F:       # %bb.0:
      9 ; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
     10 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
     11 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
     12 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
     13 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
     14 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
     15 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
     16 ; AVX512F-NEXT:    vzeroupper
     17 ; AVX512F-NEXT:    retq
     18 ;
     19 ; AVX512VL-LABEL: shuf2i1_1_0:
     20 ; AVX512VL:       # %bb.0:
     21 ; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
     22 ; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
     23 ; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
     24 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
     25 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
     26 ; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
     27 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
     28 ; AVX512VL-NEXT:    retq
     29 ;
     30 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
     31 ; VL_BW_DQ:       # %bb.0:
     32 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
     33 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
     34 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     35 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
     36 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
     37 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     38 ; VL_BW_DQ-NEXT:    retq
     39   %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
     40   ret <2 x i1> %b
     41 }
     42 
     43 define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
        ; Two-input <2 x i1> shuffle with mask <1, 2>: result lane 0 is lane 1 of
        ; %a, result lane 1 is lane 0 of the constant operand <i1 1, i1 0>
        ; (index 2 addresses the first element of the second operand).
        ; Under all three check prefixes the expected code materializes the
        ; constant with movq $-1 / vmovq and combines the inputs with vpalignr.
     44 ; AVX512F-LABEL: shuf2i1_1_2:
     45 ; AVX512F:       # %bb.0:
     46 ; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
     47 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
     48 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
     49 ; AVX512F-NEXT:    movq $-1, %rax
     50 ; AVX512F-NEXT:    vmovq %rax, %xmm1
     51 ; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
     52 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
     53 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
     54 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
     55 ; AVX512F-NEXT:    vzeroupper
     56 ; AVX512F-NEXT:    retq
     57 ;
     58 ; AVX512VL-LABEL: shuf2i1_1_2:
     59 ; AVX512VL:       # %bb.0:
     60 ; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
     61 ; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
     62 ; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
     63 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
     64 ; AVX512VL-NEXT:    movq $-1, %rax
     65 ; AVX512VL-NEXT:    vmovq %rax, %xmm2
     66 ; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
     67 ; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
     68 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
     69 ; AVX512VL-NEXT:    retq
     70 ;
     71 ; VL_BW_DQ-LABEL: shuf2i1_1_2:
     72 ; VL_BW_DQ:       # %bb.0:
     73 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
     74 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
     75 ; VL_BW_DQ-NEXT:    movq $-1, %rax
     76 ; VL_BW_DQ-NEXT:    vmovq %rax, %xmm0
     77 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
     78 ; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
     79 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
     80 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
     81 ; VL_BW_DQ-NEXT:    retq
     82   %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
     83   ret <2 x i1> %b
     84 }
     85 
     86 
     87 define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
        ; Reverses the four lanes of the <4 x i1> argument (shuffle mask
        ; <3, 2, 1, 0>; the trailing "10" in the function name stands for the
        ; indices 1 and 0). The second shuffle operand is undef.
        ; Under all three check prefixes the expected code performs the
        ; reversal with vpshufd [3,2,1,0] on a vector copy of the mask.
     88 ; AVX512F-LABEL: shuf4i1_3_2_10:
     89 ; AVX512F:       # %bb.0:
     90 ; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
     91 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
     92 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
     93 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
     94 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
     95 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
     96 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
     97 ; AVX512F-NEXT:    vzeroupper
     98 ; AVX512F-NEXT:    retq
     99 ;
    100 ; AVX512VL-LABEL: shuf4i1_3_2_10:
    101 ; AVX512VL:       # %bb.0:
    102 ; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
    103 ; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
    104 ; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
    105 ; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} {z}
    106 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
    107 ; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
    108 ; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
    109 ; AVX512VL-NEXT:    retq
    110 ;
    111 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
    112 ; VL_BW_DQ:       # %bb.0:
    113 ; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
    114 ; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
    115 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
    116 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
    117 ; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
    118 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
    119 ; VL_BW_DQ-NEXT:    retq
    120   %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    121   ret <4 x i1> %b
    122 }
    123 
    124 define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
        ; Builds two <8 x i1> masks from <8 x i64> equality compares (%a vs %a1,
        ; %b vs %b1) and shuffles them with mask <3,6,1,0,3,7,7,0>.
        ; Every mask index is below 8, so only lanes of the first compare are
        ; selected; accordingly the expected code under each check prefix
        ; contains a single vpcmpeqq (zmm2 against zmm0).
        ; AVX512F expects the permute on 64-bit lanes (vpermq on zmm); the
        ; AVX512VL and VL_BW_DQ outputs use 32-bit lanes (vpermd on ymm).
    125 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
    126 ; AVX512F:       # %bb.0:
    127 ; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
    128 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    129 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
    130 ; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
    131 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
    132 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    133 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    134 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    135 ; AVX512F-NEXT:    vzeroupper
    136 ; AVX512F-NEXT:    retq
    137 ;
    138 ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
    139 ; AVX512VL:       # %bb.0:
    140 ; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
    141 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
    142 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
    143 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
    144 ; AVX512VL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
    145 ; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
    146 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
    147 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
    148 ; AVX512VL-NEXT:    vzeroupper
    149 ; AVX512VL-NEXT:    retq
    150 ;
    151 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
    152 ; VL_BW_DQ:       # %bb.0:
    153 ; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
    154 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
    155 ; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
    156 ; VL_BW_DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
    157 ; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
    158 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
    159 ; VL_BW_DQ-NEXT:    vzeroupper
    160 ; VL_BW_DQ-NEXT:    retq
    161   %a2 = icmp eq <8 x i64> %a, %a1
    162   %b2 = icmp eq <8 x i64> %b, %b1
    163   %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
    164   ret <8 x i1> %c
    165 }
    166 
    167 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
        ; Builds two <16 x i1> masks from <16 x i32> equality compares (%a vs
        ; %a1, %b vs %b1) and shuffles them with the mask spelled out in the
        ; function name. Indices 22 and 21 are >= 16 and therefore select lanes
        ; 6 and 5 of the second compare, so this is a genuine two-input shuffle.
        ; Under all three check prefixes the expected code has two vpcmpeqd
        ; compares followed by a two-source vpermi2d on zmm registers.
    168 ; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    169 ; AVX512F:       # %bb.0:
    170 ; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
    171 ; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
    172 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    173 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    174 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    175 ; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
    176 ; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
    177 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    178 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
    179 ; AVX512F-NEXT:    vzeroupper
    180 ; AVX512F-NEXT:    retq
    181 ;
    182 ; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    183 ; AVX512VL:       # %bb.0:
    184 ; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
    185 ; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
    186 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    187 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    188 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    189 ; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
    190 ; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
    191 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    192 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
    193 ; AVX512VL-NEXT:    vzeroupper
    194 ; AVX512VL-NEXT:    retq
    195 ;
    196 ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    197 ; VL_BW_DQ:       # %bb.0:
    198 ; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
    199 ; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
    200 ; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
    201 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
    202 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    203 ; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
    204 ; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
    205 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
    206 ; VL_BW_DQ-NEXT:    vzeroupper
    207 ; VL_BW_DQ-NEXT:    retq
    208   %a2 = icmp eq <16 x i32> %a, %a1
    209   %b2 = icmp eq <16 x i32> %b, %b1
    210   %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    211   ret <16 x i1> %c
    212 }
    213 
    214 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
        ; Single-input <32 x i1> shuffle (second operand undef). The 32-entry
        ; mask is the same 16-index pattern written twice, so both halves of
        ; the result are identical.
        ; AVX512F and AVX512VL expect the input to be split into two 16-lane
        ; halves (vpmovsxbd on the low and the extracted high 128 bits), one
        ; vpermi2d, and the 16-byte result inserted into the upper half of ymm0
        ; with vinserti128. VL_BW_DQ expects a single vpermw on a zmm register.
    215 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    216 ; AVX512F:       # %bb.0:
    217 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
    218 ; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
    219 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
    220 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
    221 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
    222 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
    223 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
    224 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    225 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    226 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    227 ; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
    228 ; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
    229 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    230 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
    231 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
    232 ; AVX512F-NEXT:    retq
    233 ;
    234 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    235 ; AVX512VL:       # %bb.0:
    236 ; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
    237 ; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
    238 ; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
    239 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
    240 ; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
    241 ; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
    242 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
    243 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    244 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    245 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    246 ; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
    247 ; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
    248 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    249 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
    250 ; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
    251 ; AVX512VL-NEXT:    retq
    252 ;
    253 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
    254 ; VL_BW_DQ:       # %bb.0:
    255 ; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
    256 ; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
    257 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
    258 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    259 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
    260 ; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
    261 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
    262 ; VL_BW_DQ-NEXT:    retq
    263   %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    264   ret <32 x i1> %b
    265 }
    266 
    267 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
        ; Compares the <32 x i16> argument %a against zero, shuffles the
        ; resulting <32 x i1> mask (single input, repeated 16-index pattern) and
        ; uses the shuffled mask to select per lane between %c and %d.
        ; AVX512F and AVX512VL expect two 256-bit vpcmpeqw compares, a
        ; vpermi2d on widened lanes and two vpblendvb selects. VL_BW_DQ expects
        ; vptestnmw, one vpermw and a masked vpblendmw on zmm registers.
    268 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
    269 ; AVX512F:       # %bb.0:
    270 ; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
    271 ; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm0, %ymm0
    272 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
    273 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
    274 ; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm1, %ymm0
    275 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
    276 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
    277 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    278 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    279 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    280 ; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm6
    281 ; AVX512F-NEXT:    vptestmd %zmm6, %zmm6, %k1
    282 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    283 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm1
    284 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
    285 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
    286 ; AVX512F-NEXT:    retq
    287 ;
    288 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
    289 ; AVX512VL:       # %bb.0:
    290 ; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
    291 ; AVX512VL-NEXT:    vpcmpeqw %ymm6, %ymm0, %ymm0
    292 ; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
    293 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k1
    294 ; AVX512VL-NEXT:    vpcmpeqw %ymm6, %ymm1, %ymm0
    295 ; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
    296 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
    297 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    298 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    299 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    300 ; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm6
    301 ; AVX512VL-NEXT:    vptestmd %zmm6, %zmm6, %k1
    302 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    303 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm1
    304 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
    305 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
    306 ; AVX512VL-NEXT:    retq
    307 ;
    308 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
    309 ; VL_BW_DQ:       # %bb.0:
    310 ; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
    311 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
    312 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    313 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
    314 ; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
    315 ; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
    316 ; VL_BW_DQ-NEXT:    retq
    317   %cmp = icmp eq <32 x i16> %a, zeroinitializer
    318   %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    319   %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
    320   ret <32 x i16> %sel
    321 }
    322 
    323 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
        ; Byte-element variant of the compare/shuffle/select pattern: compares
        ; the <32 x i8> argument %a against zero, shuffles the <32 x i1> mask
        ; (single input, repeated 16-index pattern) and selects per lane
        ; between %c and %d.
        ; AVX512F and AVX512VL expect one 256-bit vpcmpeqb, a vpermi2d on
        ; widened lanes, vinserti128 to rebuild a 256-bit mask and one
        ; vpblendvb. VL_BW_DQ expects vptestnmb, vpermw and a masked vpblendmb.
    324 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
    325 ; AVX512F:       # %bb.0:
    326 ; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
    327 ; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
    328 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
    329 ; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
    330 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
    331 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
    332 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
    333 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    334 ; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
    335 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    336 ; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
    337 ; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
    338 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    339 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
    340 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
    341 ; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
    342 ; AVX512F-NEXT:    retq
    343 ;
    344 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
    345 ; AVX512VL:       # %bb.0:
    346 ; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
    347 ; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
    348 ; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
    349 ; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
    350 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
    351 ; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
    352 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
    353 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    354 ; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
    355 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    356 ; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
    357 ; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
    358 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    359 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
    360 ; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
    361 ; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
    362 ; AVX512VL-NEXT:    retq
    363 ;
    364 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
    365 ; VL_BW_DQ:       # %bb.0:
    366 ; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
    367 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
    368 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    369 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
    370 ; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
    371 ; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
    372 ; VL_BW_DQ-NEXT:    retq
    373   %cmp = icmp eq <32 x i8> %a, zeroinitializer
    374   %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    375   %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
    376   ret <32 x i8> %sel
    377 }
    378 
    379 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
        ; "Split" variant: the <32 x i1> mask is assembled from two separate
        ; <16 x i32> zero-compares (%a and %b) joined by an identity
        ; shufflevector (indices 0..31), then shuffled with the repeated
        ; 16-index pattern and used to select between <32 x i16> %c and %d.
        ; AVX512F and AVX512VL expect two vptestnmd compares feeding one
        ; vpermi2d and two vpblendvb selects. VL_BW_DQ expects the two 16-bit
        ; masks to be joined with kunpckwd before a single vpermw.
    380 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
    381 ; AVX512F:       # %bb.0:
    382 ; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
    383 ; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
    384 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    385 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    386 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    387 ; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm6
    388 ; AVX512F-NEXT:    vptestmd %zmm6, %zmm6, %k1
    389 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    390 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm1
    391 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
    392 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
    393 ; AVX512F-NEXT:    retq
    394 ;
    395 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
    396 ; AVX512VL:       # %bb.0:
    397 ; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
    398 ; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
    399 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    400 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    401 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    402 ; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm6
    403 ; AVX512VL-NEXT:    vptestmd %zmm6, %zmm6, %k1
    404 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    405 ; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm1
    406 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
    407 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
    408 ; AVX512VL-NEXT:    retq
    409 ;
    410 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
    411 ; VL_BW_DQ:       # %bb.0:
    412 ; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
    413 ; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
    414 ; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
    415 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
    416 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    417 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
    418 ; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
    419 ; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
    420 ; VL_BW_DQ-NEXT:    retq
    421   %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
    422   %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
    423   %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    424   %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    425   %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
    426   ret <32 x i16> %sel
    427 }
    428 
    429 define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
        ; "Split" variant with byte selects: the <32 x i1> mask is assembled
        ; from two <16 x i32> zero-compares (%a and %b) joined by an identity
        ; shufflevector (indices 0..31), then shuffled with the repeated
        ; 16-index pattern and used to select between <32 x i8> %c and %d.
        ; AVX512F and AVX512VL expect two vptestnmd compares, one vpermi2d,
        ; vinserti128 to rebuild a 256-bit mask and one vpblendvb. VL_BW_DQ
        ; expects kunpckwd, a single vpermw and a masked vpblendmb.
    430 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
    431 ; AVX512F:       # %bb.0:
    432 ; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
    433 ; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
    434 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    435 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    436 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    437 ; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
    438 ; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
    439 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    440 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
    441 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
    442 ; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
    443 ; AVX512F-NEXT:    retq
    444 ;
    445 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
    446 ; AVX512VL:       # %bb.0:
    447 ; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
    448 ; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
    449 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
    450 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
    451 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    452 ; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
    453 ; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
    454 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    455 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
    456 ; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
    457 ; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
    458 ; AVX512VL-NEXT:    retq
    459 ;
    460 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
    461 ; VL_BW_DQ:       # %bb.0:
    462 ; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
    463 ; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
    464 ; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
    465 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
    466 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
    467 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
    468 ; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
    469 ; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
    470 ; VL_BW_DQ-NEXT:    retq
    471   %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
    472   %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
    473   %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
    474   %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
    475   %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
    476   ret <32 x i8> %sel
    477 }
    478 
    479 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
        ; Reinterprets the i8 argument as an <8 x i1> mask (bitcast) and
        ; shuffles it with a mostly-undef mask: result lanes 1, 4 and 6 take
        ; source lane 2, every other lane index is undef ("u" in the name).
        ; Under all three check prefixes the expected code implements the
        ; shuffle as a broadcast (vpbroadcastq) of the relevant element.
    480 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
    481 ; AVX512F:       # %bb.0:
    482 ; AVX512F-NEXT:    kmovw %edi, %k1
    483 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    484 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
    485 ; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
    486 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
    487 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
    488 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    489 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
    490 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    491 ; AVX512F-NEXT:    vzeroupper
    492 ; AVX512F-NEXT:    retq
    493 ;
    494 ; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
    495 ; AVX512VL:       # %bb.0:
    496 ; AVX512VL-NEXT:    kmovw %edi, %k1
    497 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
    498 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
    499 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
    500 ; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
    501 ; AVX512VL-NEXT:    vpslld $31, %ymm1, %ymm1
    502 ; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
    503 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
    504 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
    505 ; AVX512VL-NEXT:    vzeroupper
    506 ; AVX512VL-NEXT:    retq
    507 ;
    508 ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
    509 ; VL_BW_DQ:       # %bb.0:
    510 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
    511 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
    512 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
    513 ; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %ymm0
    514 ; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
    515 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
    516 ; VL_BW_DQ-NEXT:    vzeroupper
    517 ; VL_BW_DQ-NEXT:    retq
    518   %b = bitcast i8 %a to <8 x i1>
    519   %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
    520   ret <8 x i1> %c
    521 }
    522 
    523 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
        ; Reinterprets the i8 argument as an <8 x i1> mask, shuffles it against
        ; an all-zero second operand, and returns the result bitcast back to i8.
        ; Indices 10 and 9 are >= 8 and therefore select (zero) lanes of the
        ; second operand; indices 2, 3 and 2 select lanes of the argument, and
        ; the remaining positions are undef.
        ; Under all three check prefixes the expected code materializes the
        ; zero vector with vpxor and uses a two-source permute (vpermi2q for
        ; AVX512F, vpermi2d for the AVX512VL and VL_BW_DQ outputs).
    524 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
    525 ; AVX512F:       # %bb.0:
    526 ; AVX512F-NEXT:    kmovw %edi, %k1
    527 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    528 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    529 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
    530 ; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
    531 ; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
    532 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    533 ; AVX512F-NEXT:    kmovw %k0, %eax
    534 ; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
    535 ; AVX512F-NEXT:    vzeroupper
    536 ; AVX512F-NEXT:    retq
    537 ;
    538 ; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
    539 ; AVX512VL:       # %bb.0:
    540 ; AVX512VL-NEXT:    kmovw %edi, %k1
    541 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
    542 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
    543 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
    544 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    545 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3]
    546 ; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
    547 ; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
    548 ; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
    549 ; AVX512VL-NEXT:    kmovw %k0, %eax
    550 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
    551 ; AVX512VL-NEXT:    vzeroupper
    552 ; AVX512VL-NEXT:    retq
    553 ;
    554 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
    555 ; VL_BW_DQ:       # %bb.0:
    556 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
    557 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
    558 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
    559 ; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    560 ; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3]
    561 ; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
    562 ; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
    563 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
    564 ; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
    565 ; VL_BW_DQ-NEXT:    vzeroupper
    566 ; VL_BW_DQ-NEXT:    retq
    567   %b = bitcast i8 %a to <8 x i1>
    568   %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
    569   %d = bitcast <8 x i1> %c to i8
    570   ret i8 %d
    571 }
    572 
    ; Test: single-input shuffle of an 8-lane i1 mask where only the low four
    ; result lanes are defined.
    ;   %a is reinterpreted as <8 x i1> (%b); mask = <0, 1, 4, 5, u, u, u, u>.
    ;   The second shuffle operand is undef and no index refers to it, so:
    ;     lane 0 = %b[0]   lane 1 = %b[1]   lane 2 = %b[4]   lane 3 = %b[5]
    ;     lanes 4-7 = undef
    ;   The shuffled mask is bitcast back to i8 and returned.
    ; The CHECK lines below are autogenerated (see the NOTE at the top of the
    ; file); regenerate them instead of editing them by hand.
    573 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
    574 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
    575 ; AVX512F:       # %bb.0:
    576 ; AVX512F-NEXT:    kmovw %edi, %k1
    577 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    578 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
    579 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
    580 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
    581 ; AVX512F-NEXT:    kmovw %k0, %eax
    582 ; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
    583 ; AVX512F-NEXT:    vzeroupper
    584 ; AVX512F-NEXT:    retq
    585 ;
    586 ; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
    587 ; AVX512VL:       # %bb.0:
    588 ; AVX512VL-NEXT:    kmovw %edi, %k1
    589 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
    590 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
    591 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    592 ; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
    593 ; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
    594 ; AVX512VL-NEXT:    kmovw %k0, %eax
    595 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
    596 ; AVX512VL-NEXT:    vzeroupper
    597 ; AVX512VL-NEXT:    retq
    598 ;
    599 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
    600 ; VL_BW_DQ:       # %bb.0:
    601 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
    602 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
    603 ; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
    604 ; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
    605 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
    606 ; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
    607 ; VL_BW_DQ-NEXT:    vzeroupper
    608 ; VL_BW_DQ-NEXT:    retq
    609   %b = bitcast i8 %a to <8 x i1>
    610   %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
    611   %d = bitcast <8 x i1> %c to i8
    612   ret i8 %d
    613 }
    614 
    ; Test: fully-defined shuffle of an 8-lane i1 mask with one lane taken
    ; from an all-zero second operand.
    ;   %a is reinterpreted as <8 x i1> (%b); mask = <9, 6, 1, 0, 3, 7, 7, 0>.
    ;   Index 9 selects lane 1 of the zeroinitializer operand; every other
    ;   index selects from %b, so:
    ;     lane 0 = false   lane 1 = %b[6]   lane 2 = %b[1]   lane 3 = %b[0]
    ;     lane 4 = %b[3]   lane 5 = %b[7]   lane 6 = %b[7]   lane 7 = %b[0]
    ;   The shuffled mask is bitcast back to i8 and returned.
    ; The CHECK lines below are autogenerated (see the NOTE at the top of the
    ; file); regenerate them instead of editing them by hand.
    615 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
    616 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
    617 ; AVX512F:       # %bb.0:
    618 ; AVX512F-NEXT:    kmovw %edi, %k1
    619 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    620 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    621 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
    622 ; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
    623 ; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
    624 ; AVX512F-NEXT:    kmovw %k0, %eax
    625 ; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
    626 ; AVX512F-NEXT:    vzeroupper
    627 ; AVX512F-NEXT:    retq
    628 ;
    629 ; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
    630 ; AVX512VL:       # %bb.0:
    631 ; AVX512VL-NEXT:    kmovw %edi, %k1
    632 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
    633 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
    634 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    635 ; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
    636 ; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
    637 ; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
    638 ; AVX512VL-NEXT:    kmovw %k0, %eax
    639 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
    640 ; AVX512VL-NEXT:    vzeroupper
    641 ; AVX512VL-NEXT:    retq
    642 ;
    643 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
    644 ; VL_BW_DQ:       # %bb.0:
    645 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
    646 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
    647 ; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    648 ; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
    649 ; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
    650 ; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
    651 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
    652 ; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
    653 ; VL_BW_DQ-NEXT:    vzeroupper
    654 ; VL_BW_DQ-NEXT:    retq
    655   %b = bitcast i8 %a to <8 x i1>
    656   %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
    657   %d = bitcast <8 x i1>%c to i8
    658   ret i8 %d
    659 }
    660 
    ; Test: 8-lane i1 shuffle where the all-zero vector is the FIRST operand
    ; and the runtime mask is the SECOND operand.
    ;   %a is reinterpreted as <8 x i1> (%b); mask = <9, 6, 1, 10, 3, 7, 7, 0>.
    ;   Because the operands are (zeroinitializer, %b), indices 0-7 yield
    ;   false and indices 8-15 select from %b, so:
    ;     lane 0 = %b[1]   lane 3 = %b[2]   all other lanes = false
    ;   The shuffled mask is bitcast back to i8 and returned.
    ; The CHECK lines below are autogenerated (see the NOTE at the top of the
    ; file); regenerate them instead of editing them by hand.
    661 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
    662 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
    663 ; AVX512F:       # %bb.0:
    664 ; AVX512F-NEXT:    kmovw %edi, %k1
    665 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    666 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
    667 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    668 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    669 ; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
    670 ; AVX512F-NEXT:    kmovw %k0, %eax
    671 ; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
    672 ; AVX512F-NEXT:    vzeroupper
    673 ; AVX512F-NEXT:    retq
    674 ;
    675 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
    676 ; AVX512VL:       # %bb.0:
    677 ; AVX512VL-NEXT:    kmovw %edi, %k1
    678 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
    679 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
    680 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
    681 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    682 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
    683 ; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
    684 ; AVX512VL-NEXT:    kmovw %k0, %eax
    685 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
    686 ; AVX512VL-NEXT:    vzeroupper
    687 ; AVX512VL-NEXT:    retq
    688 ;
    689 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
    690 ; VL_BW_DQ:       # %bb.0:
    691 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
    692 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
    693 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
    694 ; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    695 ; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
    696 ; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
    697 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
    698 ; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
    699 ; VL_BW_DQ-NEXT:    vzeroupper
    700 ; VL_BW_DQ-NEXT:    retq
    701   %b = bitcast i8 %a to <8 x i1>
    702   %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
    703   %d = bitcast <8 x i1>%c to i8
    704   ret i8 %d
    705 }
    706 
    ; Test: 8-lane i1 shuffle whose FIRST operand is a mixed constant mask
    ; and whose SECOND operand is the runtime mask.
    ;   %a is reinterpreted as <8 x i1> (%b).
    ;   First operand C = <1, 1, 0, 0, 1, 1, 0, 0>; mask = <9, 6, 1, 0, 3, 7, 7, 1>.
    ;   Index 9 selects %b[1]; every other index selects a constant lane of C:
    ;     lane 0 = %b[1]
    ;     lane 1 = C[6] = 0   lane 2 = C[1] = 1   lane 3 = C[0] = 1
    ;     lane 4 = C[3] = 0   lane 5 = C[7] = 0   lane 6 = C[7] = 0
    ;     lane 7 = C[1] = 1
    ;   The shuffled mask is bitcast back to i8 and returned.
    ; NOTE(review): the function name spells index 10 for lane 3, but the mask
    ; actually uses index 0 there -- confirm whether the name or the mask is
    ; intended. Renaming would require regenerating the LABEL checks.
    ; The CHECK lines below are autogenerated (see the NOTE at the top of the
    ; file); regenerate them instead of editing them by hand.
    707 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
    708 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
    709 ; AVX512F:       # %bb.0:
    710 ; AVX512F-NEXT:    kmovw %edi, %k1
    711 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    712 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
    713 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
    714 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    715 ; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
    716 ; AVX512F-NEXT:    kmovw %k0, %eax
    717 ; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
    718 ; AVX512F-NEXT:    vzeroupper
    719 ; AVX512F-NEXT:    retq
    720 ;
    721 ; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
    722 ; AVX512VL:       # %bb.0:
    723 ; AVX512VL-NEXT:    kmovw %edi, %k1
    724 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
    725 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
    726 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
    727 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
    728 ; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
    729 ; AVX512VL-NEXT:    kmovw %k0, %eax
    730 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
    731 ; AVX512VL-NEXT:    vzeroupper
    732 ; AVX512VL-NEXT:    retq
    733 ;
    734 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
    735 ; VL_BW_DQ:       # %bb.0:
    736 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
    737 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
    738 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
    739 ; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
    740 ; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
    741 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
    742 ; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
    743 ; VL_BW_DQ-NEXT:    vzeroupper
    744 ; VL_BW_DQ-NEXT:    retq
    745   %b = bitcast i8 %a to <8 x i1>
    746   %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
    747   %c1 = bitcast <8 x i1>%c to i8
    748   ret i8 %c1
    749 }
    750 
    ; Test: 8-lane i1 shuffle whose FIRST operand is an all-ones constant and
    ; whose SECOND operand is the incoming vector argument.
    ;   Unlike the neighbouring tests, %a arrives as <8 x i1> directly rather
    ;   than being bitcast from an i8 scalar.
    ;   mask = <9, 6, 1, 0, 3, 7, 7, 0>: index 9 selects %a[1]; every other
    ;   index selects a lane of the all-ones constant, so:
    ;     lane 0 = %a[1]   lanes 1-7 = true
    ;   The shuffled mask is bitcast to i8 and returned.
    ; NOTE(review): the function name spells index 10 for lane 3, but the mask
    ; actually uses index 0 there -- confirm whether the name or the mask is
    ; intended. Renaming would require regenerating the LABEL checks.
    ; The CHECK lines below are autogenerated (see the NOTE at the top of the
    ; file); regenerate them instead of editing them by hand.
    751 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
    752 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
    753 ; AVX512F:       # %bb.0:
    754 ; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
    755 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
    756 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
    757 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    758 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
    759 ; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    760 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
    761 ; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
    762 ; AVX512F-NEXT:    kmovw %k0, %eax
    763 ; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
    764 ; AVX512F-NEXT:    vzeroupper
    765 ; AVX512F-NEXT:    retq
    766 ;
    767 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
    768 ; AVX512VL:       # %bb.0:
    769 ; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
    770 ; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
    771 ; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
    772 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
    773 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
    774 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
    775 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
    776 ; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
    777 ; AVX512VL-NEXT:    kmovw %k0, %eax
    778 ; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
    779 ; AVX512VL-NEXT:    vzeroupper
    780 ; AVX512VL-NEXT:    retq
    781 ;
    782 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
    783 ; VL_BW_DQ:       # %bb.0:
    784 ; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
    785 ; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
    786 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
    787 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
    788 ; VL_BW_DQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    789 ; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
    790 ; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
    791 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
    792 ; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
    793 ; VL_BW_DQ-NEXT:    vzeroupper
    794 ; VL_BW_DQ-NEXT:    retq
    795   %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
    796   %c1 = bitcast <8 x i1>%c to i8
    797   ret i8 %c1
    798 }
    799 
    800 
    ; Test: splat of lane 0 across a 16-lane i1 mask.
    ;   %a is reinterpreted as <16 x i1> (%b). The shuffle mask is
    ;   zeroinitializer, i.e. every result lane selects %b[0]; the second
    ;   operand (undef) is never referenced.
    ;   The shuffled mask is bitcast back to i16 and returned.
    ; The CHECK lines below are autogenerated (see the NOTE at the top of the
    ; file); regenerate them instead of editing them by hand.
    801 define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
    802 ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
    803 ; AVX512F:       # %bb.0:
    804 ; AVX512F-NEXT:    kmovw %edi, %k1
    805 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    806 ; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
    807 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
    808 ; AVX512F-NEXT:    kmovw %k0, %eax
    809 ; AVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
    810 ; AVX512F-NEXT:    vzeroupper
    811 ; AVX512F-NEXT:    retq
    812 ;
    813 ; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
    814 ; AVX512VL:       # %bb.0:
    815 ; AVX512VL-NEXT:    kmovw %edi, %k1
    816 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    817 ; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
    818 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
    819 ; AVX512VL-NEXT:    kmovw %k0, %eax
    820 ; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
    821 ; AVX512VL-NEXT:    vzeroupper
    822 ; AVX512VL-NEXT:    retq
    823 ;
    824 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
    825 ; VL_BW_DQ:       # %bb.0:
    826 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
    827 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
    828 ; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
    829 ; VL_BW_DQ-NEXT:    vpmovd2m %zmm0, %k0
    830 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
    831 ; VL_BW_DQ-NEXT:    # kill: def $ax killed $ax killed $eax
    832 ; VL_BW_DQ-NEXT:    vzeroupper
    833 ; VL_BW_DQ-NEXT:    retq
    834   %b = bitcast i16 %a to <16 x i1>
    835   %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
    836   %d = bitcast <16 x i1> %c to i16
    837   ret i16 %d
    838 }
    839 
    ; Test: splat of lane 0 across a 64-lane i1 mask.
    ;   %a is reinterpreted as <64 x i1> (%b). The shuffle mask is
    ;   zeroinitializer, i.e. every result lane selects %b[0]; the second
    ;   operand (undef) is never referenced.
    ;   The shuffled mask is bitcast back to i64 and returned.
    ; In the expected output below, the AVX512F and AVX512VL sequences compute
    ; a 16-bit mask and replicate it into the 64-bit result with shifts and
    ; ORs, while the VL_BW_DQ sequence works on byte elements
    ; (vpmovm2b / vpbroadcastb / vpmovb2m) and moves a 64-bit k-register.
    ; The CHECK lines below are autogenerated (see the NOTE at the top of the
    ; file); regenerate them instead of editing them by hand.
    840 define i64 @shuf64i1_zero(i64 %a) {
    841 ; AVX512F-LABEL: shuf64i1_zero:
    842 ; AVX512F:       # %bb.0:
    843 ; AVX512F-NEXT:    kmovw %edi, %k1
    844 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    845 ; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
    846 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
    847 ; AVX512F-NEXT:    kmovw %k0, %eax
    848 ; AVX512F-NEXT:    kmovw %k0, %ecx
    849 ; AVX512F-NEXT:    shll $16, %ecx
    850 ; AVX512F-NEXT:    orl %eax, %ecx
    851 ; AVX512F-NEXT:    movq %rcx, %rax
    852 ; AVX512F-NEXT:    shlq $32, %rax
    853 ; AVX512F-NEXT:    orq %rcx, %rax
    854 ; AVX512F-NEXT:    vzeroupper
    855 ; AVX512F-NEXT:    retq
    856 ;
    857 ; AVX512VL-LABEL: shuf64i1_zero:
    858 ; AVX512VL:       # %bb.0:
    859 ; AVX512VL-NEXT:    kmovw %edi, %k1
    860 ; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
    861 ; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
    862 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
    863 ; AVX512VL-NEXT:    kmovw %k0, %eax
    864 ; AVX512VL-NEXT:    kmovw %k0, %ecx
    865 ; AVX512VL-NEXT:    shll $16, %ecx
    866 ; AVX512VL-NEXT:    orl %eax, %ecx
    867 ; AVX512VL-NEXT:    movq %rcx, %rax
    868 ; AVX512VL-NEXT:    shlq $32, %rax
    869 ; AVX512VL-NEXT:    orq %rcx, %rax
    870 ; AVX512VL-NEXT:    vzeroupper
    871 ; AVX512VL-NEXT:    retq
    872 ;
    873 ; VL_BW_DQ-LABEL: shuf64i1_zero:
    874 ; VL_BW_DQ:       # %bb.0:
    875 ; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
    876 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
    877 ; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
    878 ; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
    879 ; VL_BW_DQ-NEXT:    kmovq %k0, %rax
    880 ; VL_BW_DQ-NEXT:    vzeroupper
    881 ; VL_BW_DQ-NEXT:    retq
    882   %b = bitcast i64 %a to <64 x i1>
    883   %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
    884   %d = bitcast <64 x i1> %c to i64
    885   ret i64 %d
    886 }
    887