Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
      7 
      8 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
        ; Codegen test: llvm.cttz on <8 x i64> with the i1 flag operand set to 0
        ; (see the call at the bottom of this function).
        ; Every run below expects the same front end of the sequence: negate the
        ; input (0 - x), AND it with x to isolate the lowest set bit, add an
        ; all-ones vector (i.e. subtract 1), then population-count the result.
        ; Only the popcount step differs between runs: the VPOPCNTDQ run expects a
        ; single vpopcntq, while the other runs expect a vpshufb nibble lookup
        ; (table 0,1,1,2,...,4) whose byte counts are summed per qword by vpsadbw.
        ; The AVX512F+CD-only run performs that lookup on two ymm halves and
        ; reassembles them with vinserti64x4.
      9 ; AVX512CD-LABEL: testv8i64:
     10 ; AVX512CD:       # %bb.0:
     11 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     12 ; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
     13 ; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
     14 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
     15 ; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
     16 ; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
     17 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
     18 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
     19 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
     20 ; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
     21 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
     22 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
     23 ; AVX512CD-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
     24 ; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
     25 ; AVX512CD-NEXT:    vpxor %xmm3, %xmm3, %xmm3
     26 ; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm1, %ymm1
     27 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm5
     28 ; AVX512CD-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
     29 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
     30 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
     31 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
     32 ; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
     33 ; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm0, %ymm0
     34 ; AVX512CD-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
     35 ; AVX512CD-NEXT:    retq
     36 ;
     37 ; AVX512CDBW-LABEL: testv8i64:
     38 ; AVX512CDBW:       # %bb.0:
     39 ; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     40 ; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
     41 ; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
     42 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
     43 ; AVX512CDBW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     44 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
     45 ; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
     46 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
     47 ; AVX512CDBW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
     48 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
     49 ; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
     50 ; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
     51 ; AVX512CDBW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
     52 ; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
     53 ; AVX512CDBW-NEXT:    retq
     54 ;
     55 ; AVX512BW-LABEL: testv8i64:
     56 ; AVX512BW:       # %bb.0:
     57 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     58 ; AVX512BW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
     59 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
     60 ; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
     61 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     62 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
     63 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
     64 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
     65 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
     66 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
     67 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
     68 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
     69 ; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
     70 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
     71 ; AVX512BW-NEXT:    retq
     72 ;
     73 ; AVX512VPOPCNTDQ-LABEL: testv8i64:
     74 ; AVX512VPOPCNTDQ:       # %bb.0:
     75 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     76 ; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
     77 ; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
     78 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
     79 ; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
     80 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
     81 ; AVX512VPOPCNTDQ-NEXT:    retq
     82 ;
     83 ; BITALG-LABEL: testv8i64:
     84 ; BITALG:       # %bb.0:
     85 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     86 ; BITALG-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
     87 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
     88 ; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
     89 ; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     90 ; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
     91 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
     92 ; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
     93 ; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
     94 ; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
     95 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
     96 ; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
     97 ; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
     98 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
     99 ; BITALG-NEXT:    retq
          ; NOTE(review): the trailing i1 is the intrinsic's flag operand; per the
          ; LLVM LangRef a value of 0 means a zero input element must still yield a
          ; defined result - confirm the wording against the LangRef in use.
    100   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
    101   ret <8 x i64> %out
    102 }
    103 
    104 define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
        ; Codegen test: llvm.cttz on <8 x i64> with the i1 flag operand set to -1
        ; (see the call at the bottom of this function).
        ; The two runs that enable AVX512CD expect a shorter sequence than in the
        ; flag-clear variant: isolate the lowest set bit (x & (0 - x)), count
        ; leading zeros with vplzcntq, and subtract that count from a broadcast 63.
        ; The remaining runs expect the popcount-based sequence: (x & -x) plus
        ; all-ones, followed by vpopcntq in the VPOPCNTDQ run or by the vpshufb
        ; nibble lookup + vpsadbw reduction in the BW-only and BITALG runs.
    105 ; AVX512CD-LABEL: testv8i64u:
    106 ; AVX512CD:       # %bb.0:
    107 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    108 ; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
    109 ; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    110 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
    111 ; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63]
    112 ; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
    113 ; AVX512CD-NEXT:    retq
    114 ;
    115 ; AVX512CDBW-LABEL: testv8i64u:
    116 ; AVX512CDBW:       # %bb.0:
    117 ; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    118 ; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
    119 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    120 ; AVX512CDBW-NEXT:    vplzcntq %zmm0, %zmm0
    121 ; AVX512CDBW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63]
    122 ; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
    123 ; AVX512CDBW-NEXT:    retq
    124 ;
    125 ; AVX512BW-LABEL: testv8i64u:
    126 ; AVX512BW:       # %bb.0:
    127 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    128 ; AVX512BW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
    129 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    130 ; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    131 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    132 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    133 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
    134 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    135 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
    136 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    137 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    138 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
    139 ; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
    140 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
    141 ; AVX512BW-NEXT:    retq
    142 ;
    143 ; AVX512VPOPCNTDQ-LABEL: testv8i64u:
    144 ; AVX512VPOPCNTDQ:       # %bb.0:
    145 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    146 ; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
    147 ; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    148 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    149 ; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
    150 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
    151 ; AVX512VPOPCNTDQ-NEXT:    retq
    152 ;
    153 ; BITALG-LABEL: testv8i64u:
    154 ; BITALG:       # %bb.0:
    155 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    156 ; BITALG-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
    157 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    158 ; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    159 ; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    160 ; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    161 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
    162 ; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    163 ; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
    164 ; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
    165 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    166 ; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
    167 ; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
    168 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
    169 ; BITALG-NEXT:    retq
          ; NOTE(review): i1 -1 is the all-ones (true) value of the flag operand; per
          ; the LLVM LangRef this marks a zero input element as undefined/poison,
          ; which is presumably what permits the lzcnt-based lowering above - confirm.
    170   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
    171   ret <8 x i64> %out
    172 }
    173 
    174 define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
        ; Codegen test: llvm.cttz on <16 x i32> with the i1 flag operand set to 0
        ; (see the call at the bottom of this function).
        ; Every run expects (x & (0 - x)) plus an all-ones vector, computed with
        ; dword arithmetic (vpsubd / vpaddd), followed by a per-dword popcount.
        ; The VPOPCNTDQ run expects a single vpopcntd. The other runs expect the
        ; vpshufb nibble lookup to produce per-byte counts, which are then
        ; interleaved with zero (vpunpckhdq / vpunpckldq), summed with vpsadbw and
        ; repacked with vpackuswb to give one count per dword. The AVX512F+CD-only
        ; run does this on two ymm halves and rejoins them with vinserti64x4.
    175 ; AVX512CD-LABEL: testv16i32:
    176 ; AVX512CD:       # %bb.0:
    177 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    178 ; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
    179 ; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    180 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    181 ; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    182 ; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    183 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    184 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
    185 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    186 ; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    187 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
    188 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
    189 ; AVX512CD-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
    190 ; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
    191 ; AVX512CD-NEXT:    vpxor %xmm3, %xmm3, %xmm3
    192 ; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
    193 ; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm5, %ymm5
    194 ; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
    195 ; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm1, %ymm1
    196 ; AVX512CD-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
    197 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm5
    198 ; AVX512CD-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
    199 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
    200 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
    201 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    202 ; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
    203 ; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
    204 ; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm2, %ymm2
    205 ; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
    206 ; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm0, %ymm0
    207 ; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    208 ; AVX512CD-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
    209 ; AVX512CD-NEXT:    retq
    210 ;
    211 ; AVX512CDBW-LABEL: testv16i32:
    212 ; AVX512CDBW:       # %bb.0:
    213 ; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    214 ; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
    215 ; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    216 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    217 ; AVX512CDBW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
    218 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    219 ; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
    220 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    221 ; AVX512CDBW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
    222 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    223 ; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    224 ; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
    225 ; AVX512CDBW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
    226 ; AVX512CDBW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
    227 ; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
    228 ; AVX512CDBW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
    229 ; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
    230 ; AVX512CDBW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
    231 ; AVX512CDBW-NEXT:    retq
    232 ;
    233 ; AVX512BW-LABEL: testv16i32:
    234 ; AVX512BW:       # %bb.0:
    235 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    236 ; AVX512BW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
    237 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    238 ; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    239 ; AVX512BW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
    240 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    241 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
    242 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    243 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
    244 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    245 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    246 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
    247 ; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
    248 ; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
    249 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
    250 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
    251 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
    252 ; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
    253 ; AVX512BW-NEXT:    retq
    254 ;
    255 ; AVX512VPOPCNTDQ-LABEL: testv16i32:
    256 ; AVX512VPOPCNTDQ:       # %bb.0:
    257 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    258 ; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
    259 ; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    260 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    261 ; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    262 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
    263 ; AVX512VPOPCNTDQ-NEXT:    retq
    264 ;
    265 ; BITALG-LABEL: testv16i32:
    266 ; BITALG:       # %bb.0:
    267 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    268 ; BITALG-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
    269 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    270 ; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    271 ; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
    272 ; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    273 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
    274 ; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    275 ; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
    276 ; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
    277 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    278 ; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
    279 ; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
    280 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
    281 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
    282 ; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
    283 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
    284 ; BITALG-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
    285 ; BITALG-NEXT:    retq
          ; NOTE(review): the trailing i1 is the intrinsic's flag operand; per the
          ; LLVM LangRef a value of 0 means a zero input element must still yield a
          ; defined result - confirm the wording against the LangRef in use.
    286   %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0)
    287   ret <16 x i32> %out
    288 }
    289 
    290 define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
        ; Codegen test: llvm.cttz on <16 x i32> with the i1 flag operand set to -1
        ; (see the call at the bottom of this function).
        ; The two runs that enable AVX512CD expect: isolate the lowest set bit
        ; (x & (0 - x)), count leading zeros with vplzcntd, and subtract that count
        ; from a broadcast 31.
        ; The remaining runs expect the popcount-based sequence on (x & -x) plus
        ; all-ones: vpopcntd in the VPOPCNTDQ run, or the vpshufb nibble lookup
        ; followed by the unpack / vpsadbw / vpackuswb per-dword reduction in the
        ; BW-only and BITALG runs.
    291 ; AVX512CD-LABEL: testv16i32u:
    292 ; AVX512CD:       # %bb.0:
    293 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    294 ; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
    295 ; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    296 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
    297 ; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
    298 ; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
    299 ; AVX512CD-NEXT:    retq
    300 ;
    301 ; AVX512CDBW-LABEL: testv16i32u:
    302 ; AVX512CDBW:       # %bb.0:
    303 ; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    304 ; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
    305 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    306 ; AVX512CDBW-NEXT:    vplzcntd %zmm0, %zmm0
    307 ; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
    308 ; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
    309 ; AVX512CDBW-NEXT:    retq
    310 ;
    311 ; AVX512BW-LABEL: testv16i32u:
    312 ; AVX512BW:       # %bb.0:
    313 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    314 ; AVX512BW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
    315 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    316 ; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    317 ; AVX512BW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
    318 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    319 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
    320 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    321 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
    322 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    323 ; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    324 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
    325 ; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
    326 ; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
    327 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
    328 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
    329 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
    330 ; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
    331 ; AVX512BW-NEXT:    retq
    332 ;
    333 ; AVX512VPOPCNTDQ-LABEL: testv16i32u:
    334 ; AVX512VPOPCNTDQ:       # %bb.0:
    335 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    336 ; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
    337 ; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    338 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    339 ; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
    340 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
    341 ; AVX512VPOPCNTDQ-NEXT:    retq
    342 ;
    343 ; BITALG-LABEL: testv16i32u:
    344 ; BITALG:       # %bb.0:
    345 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    346 ; BITALG-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
    347 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    348 ; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
    349 ; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
    350 ; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    351 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
    352 ; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    353 ; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
    354 ; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
    355 ; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
    356 ; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
    357 ; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
    358 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
    359 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
    360 ; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
    361 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
    362 ; BITALG-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
    363 ; BITALG-NEXT:    retq
          ; NOTE(review): i1 -1 is the all-ones (true) value of the flag operand; per
          ; the LLVM LangRef this marks a zero input element as undefined/poison,
          ; which is presumably what permits the lzcnt-based lowering above - confirm.
    364   %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1)
    365   ret <16 x i32> %out
    366 }
    367 
    368 define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
        ; Codegen test: llvm.cttz on <32 x i16> with the i1 flag operand set to 0
        ; (see the call at the bottom of this function).
        ; Every run expects (x & (0 - x)) plus an all-ones vector using word
        ; arithmetic (vpsubw / vpaddw), followed by a per-word popcount.
        ; In the AVX512F+CD-only and VPOPCNTDQ runs the check lines operate on
        ; ymm0 and ymm1 separately, i.e. the value is handled as two 256-bit halves;
        ; the runs with AVX512BW and the BITALG run operate on a single zmm.
        ; Popcount step per run:
        ;  - CD-only, CD+BW and BW-only runs: vpshufb nibble lookup gives per-byte
        ;    counts; vpsllw $8 / vpaddb / vpsrlw $8 then folds the two bytes of each
        ;    word into one count.
        ;  - VPOPCNTDQ run: each half is zero-extended to dwords (vpmovzxwd),
        ;    counted with vpopcntd and truncated back to words (vpmovdw).
        ;  - BITALG run: a single vpopcntw.
    369 ; AVX512CD-LABEL: testv32i16:
    370 ; AVX512CD:       # %bb.0:
    371 ; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    372 ; AVX512CD-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
    373 ; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
    374 ; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    375 ; AVX512CD-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
    376 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    377 ; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
    378 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    379 ; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
    380 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
    381 ; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
    382 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
    383 ; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
    384 ; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm5
    385 ; AVX512CD-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
    386 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
    387 ; AVX512CD-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
    388 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
    389 ; AVX512CD-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
    390 ; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
    391 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
    392 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
    393 ; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
    394 ; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
    395 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    396 ; AVX512CD-NEXT:    vpsllw $8, %ymm1, %ymm2
    397 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
    398 ; AVX512CD-NEXT:    vpsrlw $8, %ymm1, %ymm1
    399 ; AVX512CD-NEXT:    retq
    400 ;
    401 ; AVX512CDBW-LABEL: testv32i16:
    402 ; AVX512CDBW:       # %bb.0:
    403 ; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    404 ; AVX512CDBW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
    405 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    406 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    407 ; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
    408 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    409 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
    410 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    411 ; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
    412 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    413 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    414 ; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
    415 ; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
    416 ; AVX512CDBW-NEXT:    vpsllw $8, %zmm0, %zmm1
    417 ; AVX512CDBW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
    418 ; AVX512CDBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
    419 ; AVX512CDBW-NEXT:    retq
    420 ;
    421 ; AVX512BW-LABEL: testv32i16:
    422 ; AVX512BW:       # %bb.0:
    423 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    424 ; AVX512BW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
    425 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    426 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    427 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
    428 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    429 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
    430 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    431 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
    432 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    433 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    434 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
    435 ; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
    436 ; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm1
    437 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
    438 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
    439 ; AVX512BW-NEXT:    retq
    440 ;
    441 ; AVX512VPOPCNTDQ-LABEL: testv32i16:
    442 ; AVX512VPOPCNTDQ:       # %bb.0:
    443 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    444 ; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
    445 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
    446 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    447 ; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
    448 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
    449 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
    450 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
    451 ; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
    452 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
    453 ; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
    454 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
    455 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm1, %zmm1
    456 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm1, %ymm1
    457 ; AVX512VPOPCNTDQ-NEXT:    retq
    458 ;
    459 ; BITALG-LABEL: testv32i16:
    460 ; BITALG:       # %bb.0:
    461 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    462 ; BITALG-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
    463 ; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    464 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    465 ; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
    466 ; BITALG-NEXT:    vpopcntw %zmm0, %zmm0
    467 ; BITALG-NEXT:    retq
          ; NOTE(review): the trailing i1 is the intrinsic's flag operand; per the
          ; LLVM LangRef a value of 0 means a zero input element must still yield a
          ; defined result - confirm the wording against the LangRef in use.
    468   %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
    469   ret <32 x i16> %out
    470 }
    471 
    472 define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
        ; Counts trailing zeros in each of the 32 x i16 lanes via llvm.cttz.v32i16
        ; with the i1 flag set to -1 (true). Per the LLVM LangRef, a true flag means
        ; the result for a zero input need not be defined.
        ; Every run below is expected to form (in & -in) - 1 with word-wide sub/add
        ; and then population-count it
        ;  - the AVX512CD run handles ymm0 and ymm1 separately, using a vpshufb
        ;    nibble lookup table followed by a shift/add that sums the two bytes of
        ;    each word
        ;  - the AVX512CDBW and AVX512BW runs apply the same table sequence to a
        ;    single zmm register
        ;  - the AVX512VPOPCNTDQ run handles ymm0 and ymm1 separately, widening the
        ;    words to dwords (vpmovzxwd) for vpopcntd and narrowing with vpmovdw
        ;  - the BITALG run uses one vpopcntw on the zmm register
    473 ; AVX512CD-LABEL: testv32i16u:
    474 ; AVX512CD:       # %bb.0:
    475 ; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    476 ; AVX512CD-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
    477 ; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
    478 ; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    479 ; AVX512CD-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
    480 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    481 ; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
    482 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    483 ; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
    484 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
    485 ; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
    486 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
    487 ; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
    488 ; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm5
    489 ; AVX512CD-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
    490 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
    491 ; AVX512CD-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
    492 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
    493 ; AVX512CD-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
    494 ; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
    495 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
    496 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
    497 ; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
    498 ; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
    499 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    500 ; AVX512CD-NEXT:    vpsllw $8, %ymm1, %ymm2
    501 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
    502 ; AVX512CD-NEXT:    vpsrlw $8, %ymm1, %ymm1
    503 ; AVX512CD-NEXT:    retq
    504 ;
    505 ; AVX512CDBW-LABEL: testv32i16u:
    506 ; AVX512CDBW:       # %bb.0:
    507 ; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    508 ; AVX512CDBW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
    509 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    510 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    511 ; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
    512 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    513 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
    514 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    515 ; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
    516 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    517 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    518 ; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
    519 ; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
    520 ; AVX512CDBW-NEXT:    vpsllw $8, %zmm0, %zmm1
    521 ; AVX512CDBW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
    522 ; AVX512CDBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
    523 ; AVX512CDBW-NEXT:    retq
    524 ;
    525 ; AVX512BW-LABEL: testv32i16u:
    526 ; AVX512BW:       # %bb.0:
    527 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    528 ; AVX512BW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
    529 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    530 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    531 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
    532 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    533 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
    534 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    535 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
    536 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    537 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    538 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
    539 ; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
    540 ; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm1
    541 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
    542 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
    543 ; AVX512BW-NEXT:    retq
    544 ;
    545 ; AVX512VPOPCNTDQ-LABEL: testv32i16u:
    546 ; AVX512VPOPCNTDQ:       # %bb.0:
    547 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    548 ; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
    549 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
    550 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    551 ; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
    552 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
    553 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
    554 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
    555 ; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
    556 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
    557 ; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
    558 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
    559 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm1, %zmm1
    560 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm1, %ymm1
    561 ; AVX512VPOPCNTDQ-NEXT:    retq
    562 ;
    563 ; BITALG-LABEL: testv32i16u:
    564 ; BITALG:       # %bb.0:
    565 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    566 ; BITALG-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
    567 ; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    568 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    569 ; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
    570 ; BITALG-NEXT:    vpopcntw %zmm0, %zmm0
    571 ; BITALG-NEXT:    retq
    572   %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
    573   ret <32 x i16> %out
    574 }
    575 
    576 define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
        ; Counts trailing zeros in each of the 64 x i8 lanes via llvm.cttz.v64i8
        ; with the i1 flag set to 0 (false). Per the LLVM LangRef, a false flag
        ; means a zero input still has a defined result.
        ; Every run below is expected to form (in & -in) - 1 with byte-wide sub/add
        ; and then population-count each byte
        ;  - the AVX512CD and AVX512VPOPCNTDQ runs handle ymm0 and ymm1 separately
        ;    with a vpshufb nibble lookup table (low and high nibble counts added)
        ;  - the AVX512CDBW and AVX512BW runs apply the same table sequence to a
        ;    single zmm register
        ;  - the BITALG run uses one vpopcntb on the zmm register
    577 ; AVX512CD-LABEL: testv64i8:
    578 ; AVX512CD:       # %bb.0:
    579 ; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    580 ; AVX512CD-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
    581 ; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
    582 ; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    583 ; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    584 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    585 ; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
    586 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    587 ; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
    588 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
    589 ; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
    590 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
    591 ; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
    592 ; AVX512CD-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
    593 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
    594 ; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
    595 ; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
    596 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
    597 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
    598 ; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
    599 ; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
    600 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    601 ; AVX512CD-NEXT:    retq
    602 ;
    603 ; AVX512CDBW-LABEL: testv64i8:
    604 ; AVX512CDBW:       # %bb.0:
    605 ; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    606 ; AVX512CDBW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
    607 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    608 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    609 ; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    610 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    611 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
    612 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    613 ; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
    614 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    615 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    616 ; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
    617 ; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
    618 ; AVX512CDBW-NEXT:    retq
    619 ;
    620 ; AVX512BW-LABEL: testv64i8:
    621 ; AVX512BW:       # %bb.0:
    622 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    623 ; AVX512BW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
    624 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    625 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    626 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    627 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    628 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
    629 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    630 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
    631 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    632 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    633 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
    634 ; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
    635 ; AVX512BW-NEXT:    retq
    636 ;
    637 ; AVX512VPOPCNTDQ-LABEL: testv64i8:
    638 ; AVX512VPOPCNTDQ:       # %bb.0:
    639 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    640 ; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
    641 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
    642 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    643 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    644 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    645 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
    646 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    647 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
    648 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
    649 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
    650 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
    651 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
    652 ; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
    653 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
    654 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
    655 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
    656 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
    657 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
    658 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
    659 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
    660 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    661 ; AVX512VPOPCNTDQ-NEXT:    retq
    662 ;
    663 ; BITALG-LABEL: testv64i8:
    664 ; BITALG:       # %bb.0:
    665 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    666 ; BITALG-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
    667 ; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    668 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    669 ; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    670 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
    671 ; BITALG-NEXT:    retq
    672   %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0)
    673   ret <64 x i8> %out
    674 }
    675 
    676 define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
        ; Counts trailing zeros in each of the 64 x i8 lanes via llvm.cttz.v64i8
        ; with the i1 flag set to -1 (true). Per the LLVM LangRef, a true flag means
        ; the result for a zero input need not be defined.
        ; The expected instruction sequences are the same as those checked for
        ; @testv64i8; each run forms (in & -in) - 1 with byte-wide sub/add and then
        ; population-counts each byte
        ;  - the AVX512CD and AVX512VPOPCNTDQ runs handle ymm0 and ymm1 separately
        ;    with a vpshufb nibble lookup table (low and high nibble counts added)
        ;  - the AVX512CDBW and AVX512BW runs apply the same table sequence to a
        ;    single zmm register
        ;  - the BITALG run uses one vpopcntb on the zmm register
    677 ; AVX512CD-LABEL: testv64i8u:
    678 ; AVX512CD:       # %bb.0:
    679 ; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    680 ; AVX512CD-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
    681 ; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
    682 ; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    683 ; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    684 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    685 ; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
    686 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    687 ; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
    688 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
    689 ; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
    690 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
    691 ; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
    692 ; AVX512CD-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
    693 ; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
    694 ; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
    695 ; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
    696 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
    697 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
    698 ; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
    699 ; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
    700 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    701 ; AVX512CD-NEXT:    retq
    702 ;
    703 ; AVX512CDBW-LABEL: testv64i8u:
    704 ; AVX512CDBW:       # %bb.0:
    705 ; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    706 ; AVX512CDBW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
    707 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    708 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    709 ; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    710 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    711 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
    712 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    713 ; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
    714 ; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    715 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    716 ; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
    717 ; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
    718 ; AVX512CDBW-NEXT:    retq
    719 ;
    720 ; AVX512BW-LABEL: testv64i8u:
    721 ; AVX512BW:       # %bb.0:
    722 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    723 ; AVX512BW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
    724 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    725 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    726 ; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    727 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    728 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
    729 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    730 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
    731 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
    732 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    733 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
    734 ; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
    735 ; AVX512BW-NEXT:    retq
    736 ;
    737 ; AVX512VPOPCNTDQ-LABEL: testv64i8u:
    738 ; AVX512VPOPCNTDQ:       # %bb.0:
    739 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    740 ; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
    741 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
    742 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
    743 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    744 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    745 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
    746 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    747 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
    748 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
    749 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
    750 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
    751 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
    752 ; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
    753 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
    754 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
    755 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
    756 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
    757 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
    758 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
    759 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
    760 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
    761 ; AVX512VPOPCNTDQ-NEXT:    retq
    762 ;
    763 ; BITALG-LABEL: testv64i8u:
    764 ; BITALG:       # %bb.0:
    765 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    766 ; BITALG-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
    767 ; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
    768 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
    769 ; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
    770 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
    771 ; BITALG-NEXT:    retq
    772   %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1)
    773   ret <64 x i8> %out
    774 }
    775 
    ; Declarations of the llvm.cttz vector intrinsics used by this test file.
    ; Each takes the input vector plus an i1 flag. The v32i16 and v64i8 forms are
    ; called by the testv32i16* and testv64i8* functions; the v8i64 and v16i32
    ; forms are presumably called by the tests earlier in the file.
    776 declare <8 x i64> @llvm.cttz.v8i64(<8 x i64>, i1)
    777 declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1)
    778 declare <32 x i16> @llvm.cttz.v32i16(<32 x i16>, i1)
    779 declare <64 x i8> @llvm.cttz.v64i8(<64 x i8>, i1)
    780