Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
     10 ;
     11 ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
     12 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
     13 
     ; testv4i64 - cttz on <4 x i64> with the second intrinsic operand set to i1 0
     ; (per the LLVM LangRef description of llvm.cttz, a zero element then has a
     ; defined result).
     ; The x86-64 runs are expected to form (x & -x) + all-ones per 64-bit lane
     ; (vpsubq from zero, vpand, vpaddq of a vpcmpeqd all-ones register) and then
     ; popcount it; the i686 run instead subtracts a constant-pool operand after
     ; the vpand. The popcount is vpopcntq on the two VPOPCNTDQ runs (on zmm0 when
     ; VL is absent) and a vpshufb nibble lookup summed per quadword with vpsadbw
     ; everywhere else. On AVX1 the sequence is applied to each 128-bit half
     ; separately and the halves are rejoined with vinsertf128.
     14 define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
     15 ; AVX1-LABEL: testv4i64:
     16 ; AVX1:       # %bb.0:
     17 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     18 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
     19 ; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
     20 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
     21 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
     22 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
     23 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
     24 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
     25 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
     26 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
     27 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
     28 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
     29 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
     30 ; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
     31 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
     32 ; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm5
     33 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
     34 ; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
     35 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
     36 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
     37 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
     38 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
     39 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
     40 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
     41 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
     42 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
     43 ; AVX1-NEXT:    retq
     44 ;
     45 ; AVX2-LABEL: testv4i64:
     46 ; AVX2:       # %bb.0:
     47 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     48 ; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
     49 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
     50 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
     51 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     52 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
     53 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
     54 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
     55 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
     56 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
     57 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
     58 ; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
     59 ; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
     60 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
     61 ; AVX2-NEXT:    retq
     62 ;
     63 ; AVX512CDVL-LABEL: testv4i64:
     64 ; AVX512CDVL:       # %bb.0:
     65 ; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     66 ; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
     67 ; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
     68 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
     69 ; AVX512CDVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     70 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
     71 ; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm3
     72 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
     73 ; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
     74 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
     75 ; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
     76 ; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
     77 ; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
     78 ; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
     79 ; AVX512CDVL-NEXT:    retq
     80 ;
     81 ; AVX512CD-LABEL: testv4i64:
     82 ; AVX512CD:       # %bb.0:
     83 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
     84 ; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
     85 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
     86 ; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
     87 ; AVX512CD-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     88 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
     89 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
     90 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
     91 ; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
     92 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
     93 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
     94 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
     95 ; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
     96 ; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
     97 ; AVX512CD-NEXT:    retq
     98 ;
     99 ; AVX512VPOPCNTDQ-LABEL: testv4i64:
    100 ; AVX512VPOPCNTDQ:       # %bb.0:
    101 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    102 ; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
    103 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
    104 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    105 ; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
    106 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
    107 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
    108 ; AVX512VPOPCNTDQ-NEXT:    retq
    109 ;
    110 ; AVX512VPOPCNTDQVL-LABEL: testv4i64:
    111 ; AVX512VPOPCNTDQVL:       # %bb.0:
    112 ; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    113 ; AVX512VPOPCNTDQVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
    114 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    115 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    116 ; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
    117 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %ymm0, %ymm0
    118 ; AVX512VPOPCNTDQVL-NEXT:    retq
    119 ;
    120 ; BITALG_NOVLX-LABEL: testv4i64:
    121 ; BITALG_NOVLX:       # %bb.0:
    122 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    123 ; BITALG_NOVLX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
    124 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    125 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    126 ; BITALG_NOVLX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    127 ; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    128 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
    129 ; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    130 ; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    131 ; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    132 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    133 ; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    134 ; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    135 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    136 ; BITALG_NOVLX-NEXT:    retq
    137 ;
    138 ; BITALG-LABEL: testv4i64:
    139 ; BITALG:       # %bb.0:
    140 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    141 ; BITALG-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
    142 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
    143 ; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    144 ; BITALG-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    145 ; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    146 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
    147 ; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    148 ; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    149 ; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
    150 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
    151 ; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    152 ; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    153 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    154 ; BITALG-NEXT:    retq
    155 ;
    156 ; X32-AVX-LABEL: testv4i64:
    157 ; X32-AVX:       # %bb.0:
    158 ; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    159 ; X32-AVX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
    160 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    161 ; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
    162 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    163 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
    164 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    165 ; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    166 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    167 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    168 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    169 ; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    170 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    171 ; X32-AVX-NEXT:    retl
    172   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
    173   ret <4 x i64> %out
    174 }
    175 
    ; testv4i64u - cttz on <4 x i64> with the second intrinsic operand set to
    ; i1 -1 (per the LLVM LangRef description of llvm.cttz, the result for a zero
    ; element is then not required to be defined).
    ; The two AVX512CD runs are expected to isolate the lowest set bit (x & -x via
    ; vpsubq from zero + vpand), take vplzcntq of it and subtract that from a
    ; broadcast 63; without VL the vplzcntq is issued on zmm0.
    ; The remaining x86-64 runs expect (x & -x) + all-ones followed by a popcount
    ; (vpopcntq with VPOPCNTDQ, otherwise a vpshufb nibble lookup summed with
    ; vpsadbw); the i686 run subtracts a constant-pool operand after the vpand
    ; instead of adding an all-ones register.
    176 define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
    177 ; AVX1-LABEL: testv4i64u:
    178 ; AVX1:       # %bb.0:
    179 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    180 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    181 ; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
    182 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
    183 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
    184 ; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
    185 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    186 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
    187 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    188 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
    189 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
    190 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
    191 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
    192 ; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
    193 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
    194 ; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm5
    195 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
    196 ; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
    197 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
    198 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
    199 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
    200 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
    201 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
    202 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    203 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
    204 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    205 ; AVX1-NEXT:    retq
    206 ;
    207 ; AVX2-LABEL: testv4i64u:
    208 ; AVX2:       # %bb.0:
    209 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    210 ; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
    211 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    212 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    213 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    214 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    215 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
    216 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    217 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    218 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
    219 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    220 ; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    221 ; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    222 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    223 ; AVX2-NEXT:    retq
    224 ;
    225 ; AVX512CDVL-LABEL: testv4i64u:
    226 ; AVX512CDVL:       # %bb.0:
    227 ; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    228 ; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
    229 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    230 ; AVX512CDVL-NEXT:    vplzcntq %ymm0, %ymm0
    231 ; AVX512CDVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63]
    232 ; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
    233 ; AVX512CDVL-NEXT:    retq
    234 ;
    235 ; AVX512CD-LABEL: testv4i64u:
    236 ; AVX512CD:       # %bb.0:
    237 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    238 ; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
    239 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
    240 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
    241 ; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63]
    242 ; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
    243 ; AVX512CD-NEXT:    retq
    244 ;
    245 ; AVX512VPOPCNTDQ-LABEL: testv4i64u:
    246 ; AVX512VPOPCNTDQ:       # %bb.0:
    247 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    248 ; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
    249 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
    250 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    251 ; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
    252 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
    253 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
    254 ; AVX512VPOPCNTDQ-NEXT:    retq
    255 ;
    256 ; AVX512VPOPCNTDQVL-LABEL: testv4i64u:
    257 ; AVX512VPOPCNTDQVL:       # %bb.0:
    258 ; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    259 ; AVX512VPOPCNTDQVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
    260 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    261 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    262 ; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
    263 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %ymm0, %ymm0
    264 ; AVX512VPOPCNTDQVL-NEXT:    retq
    265 ;
    266 ; BITALG_NOVLX-LABEL: testv4i64u:
    267 ; BITALG_NOVLX:       # %bb.0:
    268 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    269 ; BITALG_NOVLX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
    270 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    271 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    272 ; BITALG_NOVLX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    273 ; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    274 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
    275 ; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    276 ; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    277 ; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    278 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    279 ; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    280 ; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    281 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    282 ; BITALG_NOVLX-NEXT:    retq
    283 ;
    284 ; BITALG-LABEL: testv4i64u:
    285 ; BITALG:       # %bb.0:
    286 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    287 ; BITALG-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
    288 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
    289 ; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    290 ; BITALG-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    291 ; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    292 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
    293 ; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    294 ; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    295 ; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
    296 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
    297 ; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    298 ; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    299 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    300 ; BITALG-NEXT:    retq
    301 ;
    302 ; X32-AVX-LABEL: testv4i64u:
    303 ; X32-AVX:       # %bb.0:
    304 ; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    305 ; X32-AVX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
    306 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    307 ; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
    308 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    309 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
    310 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    311 ; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    312 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    313 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    314 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    315 ; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    316 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    317 ; X32-AVX-NEXT:    retl
    318   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
    319   ret <4 x i64> %out
    320 }
    321 
    ; testv8i32 - cttz on <8 x i32> with the second intrinsic operand set to i1 0
    ; (per the LLVM LangRef description of llvm.cttz, a zero element then has a
    ; defined result).
    ; Every run is expected to form (x & -x) + all-ones per 32-bit lane (vpsubd
    ; from zero, vpand, vpaddd of a vpcmpeqd all-ones register) and popcount it.
    ; The two VPOPCNTDQ runs use vpopcntd directly (on zmm0 when VL is absent).
    ; The other runs compute per-byte counts with a vpshufb nibble lookup,
    ; interleave the dwords with zero (vpunpckhdq / vpunpckldq, or vpmovzxdq for
    ; the low half on AVX1), sum each with vpsadbw and recombine the two results
    ; with vpackuswb. On AVX1 this is done per 128-bit half and the halves are
    ; rejoined with vinsertf128.
    322 define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
    323 ; AVX1-LABEL: testv8i32:
    324 ; AVX1:       # %bb.0:
    325 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    326 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    327 ; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
    328 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
    329 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
    330 ; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
    331 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    332 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
    333 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    334 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
    335 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
    336 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
    337 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
    338 ; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
    339 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
    340 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm5, %xmm5
    341 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
    342 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
    343 ; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
    344 ; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm5
    345 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
    346 ; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
    347 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
    348 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
    349 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
    350 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
    351 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
    352 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    353 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    354 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm3
    355 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
    356 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
    357 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
    358 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    359 ; AVX1-NEXT:    retq
    360 ;
    361 ; AVX2-LABEL: testv8i32:
    362 ; AVX2:       # %bb.0:
    363 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    364 ; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    365 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    366 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    367 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    368 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    369 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
    370 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    371 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    372 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
    373 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    374 ; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    375 ; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    376 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    377 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    378 ; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    379 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    380 ; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    381 ; AVX2-NEXT:    retq
    382 ;
    383 ; AVX512CDVL-LABEL: testv8i32:
    384 ; AVX512CDVL:       # %bb.0:
    385 ; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    386 ; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    387 ; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
    388 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    389 ; AVX512CDVL-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    390 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    391 ; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm3
    392 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    393 ; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    394 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
    395 ; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
    396 ; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    397 ; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    398 ; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    399 ; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    400 ; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    401 ; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    402 ; AVX512CDVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    403 ; AVX512CDVL-NEXT:    retq
    404 ;
    405 ; AVX512CD-LABEL: testv8i32:
    406 ; AVX512CD:       # %bb.0:
    407 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    408 ; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    409 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
    410 ; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    411 ; AVX512CD-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    412 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    413 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
    414 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    415 ; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    416 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
    417 ; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
    418 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    419 ; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    420 ; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    421 ; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    422 ; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    423 ; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    424 ; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    425 ; AVX512CD-NEXT:    retq
    426 ;
    427 ; AVX512VPOPCNTDQ-LABEL: testv8i32:
    428 ; AVX512VPOPCNTDQ:       # %bb.0:
    429 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    430 ; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
    431 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
    432 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    433 ; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    434 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
    435 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
    436 ; AVX512VPOPCNTDQ-NEXT:    retq
    437 ;
    438 ; AVX512VPOPCNTDQVL-LABEL: testv8i32:
    439 ; AVX512VPOPCNTDQVL:       # %bb.0:
    440 ; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    441 ; AVX512VPOPCNTDQVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
    442 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    443 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    444 ; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    445 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
    446 ; AVX512VPOPCNTDQVL-NEXT:    retq
    447 ;
    448 ; BITALG_NOVLX-LABEL: testv8i32:
    449 ; BITALG_NOVLX:       # %bb.0:
    450 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    451 ; BITALG_NOVLX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    452 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    453 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    454 ; BITALG_NOVLX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    455 ; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    456 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
    457 ; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    458 ; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    459 ; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    460 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    461 ; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    462 ; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    463 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    464 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    465 ; BITALG_NOVLX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    466 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    467 ; BITALG_NOVLX-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    468 ; BITALG_NOVLX-NEXT:    retq
    469 ;
    470 ; BITALG-LABEL: testv8i32:
    471 ; BITALG:       # %bb.0:
    472 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    473 ; BITALG-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    474 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
    475 ; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    476 ; BITALG-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    477 ; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    478 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
    479 ; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    480 ; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    481 ; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
    482 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
    483 ; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    484 ; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    485 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    486 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    487 ; BITALG-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    488 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    489 ; BITALG-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    490 ; BITALG-NEXT:    retq
    491 ;
    492 ; X32-AVX-LABEL: testv8i32:
    493 ; X32-AVX:       # %bb.0:
    494 ; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    495 ; X32-AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    496 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    497 ; X32-AVX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    498 ; X32-AVX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    499 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    500 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
    501 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    502 ; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    503 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    504 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    505 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    506 ; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    507 ; X32-AVX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    508 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    509 ; X32-AVX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    510 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    511 ; X32-AVX-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    512 ; X32-AVX-NEXT:    retl
    513   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
    514   ret <8 x i32> %out
    515 }
    516 
    517 define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
        ; Codegen test: count trailing zeros on an <8 x i32> vector, with the second
        ; operand of llvm.cttz set (the "u" flavour of this test). Each group of
        ; check lines below pins the expected llc output for one FileCheck prefix
        ; declared by the RUN lines at the top of the file. The assertions are
        ; autogenerated; regenerate them with utils/update_llc_test_checks.py
        ; instead of editing them by hand.
        ;
        ; What the expectations show: most prefixes isolate the lowest set bit
        ; (x & -x), add all-ones (i.e. subtract 1) and then popcount the result,
        ; either with a native vpopcntd or with a vpshufb lookup of per-nibble bit
        ; counts ([0,1,1,2,...]) summed via vpsadbw. The two AVX512CD prefixes
        ; instead run vplzcntd on (x & -x) and subtract the result from 31.
    518 ; AVX1-LABEL: testv8i32u:
    519 ; AVX1:       # %bb.0:
    520 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    521 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
    522 ; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
    523 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
    524 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
    525 ; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
    526 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    527 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
    528 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    529 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
    530 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
    531 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
    532 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
    533 ; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
    534 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
    535 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm5, %xmm5
    536 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
    537 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
    538 ; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
    539 ; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm5
    540 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
    541 ; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
    542 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
    543 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
    544 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
    545 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
    546 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
    547 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
    548 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    549 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm3
    550 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
    551 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
    552 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
    553 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    554 ; AVX1-NEXT:    retq
    555 ;
    556 ; AVX2-LABEL: testv8i32u:
    557 ; AVX2:       # %bb.0:
    558 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    559 ; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    560 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    561 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    562 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    563 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    564 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
    565 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    566 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    567 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
    568 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
    569 ; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    570 ; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    571 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    572 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    573 ; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    574 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    575 ; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    576 ; AVX2-NEXT:    retq
    577 ;
    578 ; AVX512CDVL-LABEL: testv8i32u:
    579 ; AVX512CDVL:       # %bb.0:
    580 ; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    581 ; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
    582 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    583 ; AVX512CDVL-NEXT:    vplzcntd %ymm0, %ymm0
    584 ; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31]
    585 ; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
    586 ; AVX512CDVL-NEXT:    retq
    587 ;
    588 ; AVX512CD-LABEL: testv8i32u:
    589 ; AVX512CD:       # %bb.0:
    590 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    591 ; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
    592 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
    593 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
    594 ; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31]
    595 ; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
    596 ; AVX512CD-NEXT:    retq
    597 ;
    598 ; AVX512VPOPCNTDQ-LABEL: testv8i32u:
    599 ; AVX512VPOPCNTDQ:       # %bb.0:
    600 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    601 ; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
    602 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
    603 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    604 ; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    605 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
    606 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
    607 ; AVX512VPOPCNTDQ-NEXT:    retq
    608 ;
    609 ; AVX512VPOPCNTDQVL-LABEL: testv8i32u:
    610 ; AVX512VPOPCNTDQVL:       # %bb.0:
    611 ; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    612 ; AVX512VPOPCNTDQVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
    613 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    614 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    615 ; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
    616 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
    617 ; AVX512VPOPCNTDQVL-NEXT:    retq
    618 ;
    619 ; BITALG_NOVLX-LABEL: testv8i32u:
    620 ; BITALG_NOVLX:       # %bb.0:
    621 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    622 ; BITALG_NOVLX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    623 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    624 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    625 ; BITALG_NOVLX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    626 ; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    627 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
    628 ; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    629 ; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    630 ; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    631 ; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    632 ; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    633 ; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    634 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    635 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    636 ; BITALG_NOVLX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    637 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    638 ; BITALG_NOVLX-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    639 ; BITALG_NOVLX-NEXT:    retq
    640 ;
    641 ; BITALG-LABEL: testv8i32u:
    642 ; BITALG:       # %bb.0:
    643 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    644 ; BITALG-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    645 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
    646 ; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    647 ; BITALG-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    648 ; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    649 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
    650 ; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    651 ; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    652 ; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
    653 ; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
    654 ; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    655 ; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    656 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    657 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    658 ; BITALG-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    659 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    660 ; BITALG-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    661 ; BITALG-NEXT:    retq
    662 ;
    663 ; X32-AVX-LABEL: testv8i32u:
    664 ; X32-AVX:       # %bb.0:
    665 ; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    666 ; X32-AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
    667 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    668 ; X32-AVX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    669 ; X32-AVX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
    670 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    671 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
    672 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    673 ; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
    674 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    675 ; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
    676 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
    677 ; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
    678 ; X32-AVX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
    679 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
    680 ; X32-AVX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
    681 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
    682 ; X32-AVX-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
    683 ; X32-AVX-NEXT:    retl
        ; `i1 -1` is the all-ones i1 value, i.e. true. Per the LLVM LangRef the
        ; second llvm.cttz operand controls whether a zero input yields an
        ; undefined/poison result. NOTE(review): presumably this is what permits
        ; the vplzcntd-based expectations above, which do not special-case a zero
        ; lane - confirm against the lowering code before relying on it.
    684   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
    685   ret <8 x i32> %out
    686 }
    687 
    688 define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
        ; Codegen test: count trailing zeros on a <16 x i16> vector, with the second
        ; operand of llvm.cttz cleared. Each group of check lines below pins the
        ; expected llc output for one FileCheck prefix declared by the RUN lines at
        ; the top of the file. The assertions are autogenerated; regenerate them
        ; with utils/update_llc_test_checks.py instead of editing them by hand.
        ;
        ; What the expectations show: every prefix isolates the lowest set bit
        ; (x & -x), adds all-ones (i.e. subtracts 1) and popcounts the result.
        ; Only the two BITALG prefixes have a word-sized popcount (vpopcntw); the
        ; two VPOPCNTDQ prefixes widen to dwords (vpmovzxwd / vpopcntd / vpmovdw);
        ; all remaining prefixes, including both AVX512CD ones, use a vpshufb
        ; lookup of per-nibble bit counts ([0,1,1,2,...]) and then fold the two
        ; byte counts of each word together with vpsllw / vpaddb / vpsrlw.
    689 ; AVX1-LABEL: testv16i16:
    690 ; AVX1:       # %bb.0:
    691 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    692 ; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm2
    693 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
    694 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
    695 ; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
    696 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    697 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
    698 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    699 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
    700 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
    701 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
    702 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
    703 ; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
    704 ; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm5
    705 ; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
    706 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
    707 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    708 ; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
    709 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    710 ; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
    711 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
    712 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
    713 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
    714 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
    715 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
    716 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    717 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
    718 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
    719 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
    720 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
    721 ; AVX1-NEXT:    retq
    722 ;
    723 ; AVX2-LABEL: testv16i16:
    724 ; AVX2:       # %bb.0:
    725 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    726 ; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    727 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
    728 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    729 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    730 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    731 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
    732 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    733 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    734 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
    735 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
    736 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
    737 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    738 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
    739 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
    740 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
    741 ; AVX2-NEXT:    retq
    742 ;
    743 ; AVX512CDVL-LABEL: testv16i16:
    744 ; AVX512CDVL:       # %bb.0:
    745 ; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    746 ; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    747 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    748 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    749 ; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    750 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    751 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
    752 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    753 ; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    754 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
    755 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    756 ; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
    757 ; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    758 ; AVX512CDVL-NEXT:    vpsllw $8, %ymm0, %ymm1
    759 ; AVX512CDVL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
    760 ; AVX512CDVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
    761 ; AVX512CDVL-NEXT:    retq
    762 ;
    763 ; AVX512CD-LABEL: testv16i16:
    764 ; AVX512CD:       # %bb.0:
    765 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    766 ; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    767 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
    768 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    769 ; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    770 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    771 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
    772 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    773 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    774 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
    775 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
    776 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
    777 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    778 ; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm1
    779 ; AVX512CD-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
    780 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
    781 ; AVX512CD-NEXT:    retq
    782 ;
    783 ; AVX512VPOPCNTDQ-LABEL: testv16i16:
    784 ; AVX512VPOPCNTDQ:       # %bb.0:
    785 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    786 ; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    787 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
    788 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    789 ; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    790 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
    791 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
    792 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
    793 ; AVX512VPOPCNTDQ-NEXT:    retq
    794 ;
    795 ; AVX512VPOPCNTDQVL-LABEL: testv16i16:
    796 ; AVX512VPOPCNTDQVL:       # %bb.0:
    797 ; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    798 ; AVX512VPOPCNTDQVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    799 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    800 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    801 ; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    802 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
    803 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
    804 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
    805 ; AVX512VPOPCNTDQVL-NEXT:    retq
    806 ;
    807 ; BITALG_NOVLX-LABEL: testv16i16:
    808 ; BITALG_NOVLX:       # %bb.0:
    809 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    810 ; BITALG_NOVLX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    811 ; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
    812 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    813 ; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    814 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
    815 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
    816 ; BITALG_NOVLX-NEXT:    retq
    817 ;
    818 ; BITALG-LABEL: testv16i16:
    819 ; BITALG:       # %bb.0:
    820 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    821 ; BITALG-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    822 ; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
    823 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    824 ; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    825 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
    826 ; BITALG-NEXT:    retq
    827 ;
    828 ; X32-AVX-LABEL: testv16i16:
    829 ; X32-AVX:       # %bb.0:
    830 ; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    831 ; X32-AVX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    832 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
    833 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    834 ; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    835 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    836 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
    837 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    838 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    839 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
    840 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
    841 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
    842 ; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    843 ; X32-AVX-NEXT:    vpsllw $8, %ymm0, %ymm1
    844 ; X32-AVX-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
    845 ; X32-AVX-NEXT:    vpsrlw $8, %ymm0, %ymm0
    846 ; X32-AVX-NEXT:    retl
        ; `i1 0` is false. Per the LLVM LangRef the second llvm.cttz operand
        ; controls whether a zero input yields an undefined/poison result, so with
        ; it cleared a zero lane must still produce a defined value.
    847   %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
    848   ret <16 x i16> %out
    849 }
    850 
    851 define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
        ; Codegen test: count trailing zeros on a <16 x i16> vector, with the second
        ; operand of llvm.cttz set (the "u" flavour of this test). Each group of
        ; check lines below pins the expected llc output for one FileCheck prefix
        ; declared by the RUN lines at the top of the file. The assertions are
        ; autogenerated; regenerate them with utils/update_llc_test_checks.py
        ; instead of editing them by hand.
        ;
        ; What the expectations show: every prefix isolates the lowest set bit
        ; (x & -x), adds all-ones (i.e. subtracts 1) and popcounts the result.
        ; Only the two BITALG prefixes have a word-sized popcount (vpopcntw); the
        ; two VPOPCNTDQ prefixes widen to dwords (vpmovzxwd / vpopcntd / vpmovdw);
        ; all remaining prefixes, including both AVX512CD ones, use a vpshufb
        ; lookup of per-nibble bit counts ([0,1,1,2,...]) and then fold the two
        ; byte counts of each word together with vpsllw / vpaddb / vpsrlw.
    852 ; AVX1-LABEL: testv16i16u:
    853 ; AVX1:       # %bb.0:
    854 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    855 ; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm2
    856 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
    857 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
    858 ; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
    859 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    860 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
    861 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    862 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
    863 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
    864 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
    865 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
    866 ; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
    867 ; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm5
    868 ; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
    869 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
    870 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    871 ; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
    872 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
    873 ; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
    874 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
    875 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
    876 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
    877 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
    878 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
    879 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
    880 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
    881 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
    882 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
    883 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
    884 ; AVX1-NEXT:    retq
    885 ;
    886 ; AVX2-LABEL: testv16i16u:
    887 ; AVX2:       # %bb.0:
    888 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    889 ; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    890 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
    891 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    892 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    893 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    894 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
    895 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    896 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    897 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
    898 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
    899 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
    900 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    901 ; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
    902 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
    903 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
    904 ; AVX2-NEXT:    retq
    905 ;
    906 ; AVX512CDVL-LABEL: testv16i16u:
    907 ; AVX512CDVL:       # %bb.0:
    908 ; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    909 ; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    910 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    911 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    912 ; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    913 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    914 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
    915 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    916 ; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    917 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
    918 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    919 ; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
    920 ; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    921 ; AVX512CDVL-NEXT:    vpsllw $8, %ymm0, %ymm1
    922 ; AVX512CDVL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
    923 ; AVX512CDVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
    924 ; AVX512CDVL-NEXT:    retq
    925 ;
    926 ; AVX512CD-LABEL: testv16i16u:
    927 ; AVX512CD:       # %bb.0:
    928 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    929 ; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    930 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
    931 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    932 ; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    933 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    934 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
    935 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
    936 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    937 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
    938 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
    939 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
    940 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
    941 ; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm1
    942 ; AVX512CD-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
    943 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
    944 ; AVX512CD-NEXT:    retq
    945 ;
    946 ; AVX512VPOPCNTDQ-LABEL: testv16i16u:
    947 ; AVX512VPOPCNTDQ:       # %bb.0:
    948 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    949 ; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    950 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
    951 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    952 ; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    953 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
    954 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
    955 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
    956 ; AVX512VPOPCNTDQ-NEXT:    retq
    957 ;
    958 ; AVX512VPOPCNTDQVL-LABEL: testv16i16u:
    959 ; AVX512VPOPCNTDQVL:       # %bb.0:
    960 ; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    961 ; AVX512VPOPCNTDQVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    962 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
    963 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    964 ; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    965 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
    966 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
    967 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
    968 ; AVX512VPOPCNTDQVL-NEXT:    retq
    969 ;
    970 ; BITALG_NOVLX-LABEL: testv16i16u:
    971 ; BITALG_NOVLX:       # %bb.0:
    972 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    973 ; BITALG_NOVLX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    974 ; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
    975 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    976 ; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    977 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
    978 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
    979 ; BITALG_NOVLX-NEXT:    retq
    980 ;
    981 ; BITALG-LABEL: testv16i16u:
    982 ; BITALG:       # %bb.0:
    983 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    984 ; BITALG-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    985 ; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
    986 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    987 ; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    988 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
    989 ; BITALG-NEXT:    retq
    990 ;
    991 ; X32-AVX-LABEL: testv16i16u:
    992 ; X32-AVX:       # %bb.0:
    993 ; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    994 ; X32-AVX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
    995 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
    996 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
    997 ; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
    998 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    999 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1000 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1001 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1002 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1003 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1004 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1005 ; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1006 ; X32-AVX-NEXT:    vpsllw $8, %ymm0, %ymm1
   1007 ; X32-AVX-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
   1008 ; X32-AVX-NEXT:    vpsrlw $8, %ymm0, %ymm0
   1009 ; X32-AVX-NEXT:    retl
        ; `i1 -1` is the all-ones i1 value, i.e. true. Per the LLVM LangRef the
        ; second llvm.cttz operand controls whether a zero input yields an
        ; undefined/poison result, so a zero lane carries no defined expectation
        ; in this test.
   1010   %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
   1011   ret <16 x i16> %out
   1012 }
   1013 
   1014 define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
   1015 ; AVX1-LABEL: testv32i8:
   1016 ; AVX1:       # %bb.0:
   1017 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1018 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
   1019 ; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
   1020 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   1021 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
   1022 ; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
   1023 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1024 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
   1025 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1026 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   1027 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   1028 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   1029 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
   1030 ; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
   1031 ; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
   1032 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   1033 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
   1034 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm2
   1035 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
   1036 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1037 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   1038 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
   1039 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
   1040 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1041 ; AVX1-NEXT:    retq
   1042 ;
   1043 ; AVX2-LABEL: testv32i8:
   1044 ; AVX2:       # %bb.0:
   1045 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1046 ; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1047 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1048 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1049 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1050 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1051 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1052 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1053 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1054 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1055 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1056 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1057 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1058 ; AVX2-NEXT:    retq
   1059 ;
   1060 ; AVX512CDVL-LABEL: testv32i8:
   1061 ; AVX512CDVL:       # %bb.0:
   1062 ; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1063 ; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1064 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1065 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1066 ; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1067 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1068 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1069 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1070 ; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1071 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1072 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1073 ; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1074 ; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1075 ; AVX512CDVL-NEXT:    retq
   1076 ;
   1077 ; AVX512CD-LABEL: testv32i8:
   1078 ; AVX512CD:       # %bb.0:
   1079 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1080 ; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1081 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1082 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1083 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1084 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1085 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1086 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1087 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1088 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1089 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1090 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1091 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1092 ; AVX512CD-NEXT:    retq
   1093 ;
   1094 ; AVX512VPOPCNTDQ-LABEL: testv32i8:
   1095 ; AVX512VPOPCNTDQ:       # %bb.0:
   1096 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1097 ; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1098 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1099 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1100 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1101 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1102 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1103 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1104 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1105 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1106 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1107 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1108 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1109 ; AVX512VPOPCNTDQ-NEXT:    retq
   1110 ;
   1111 ; AVX512VPOPCNTDQVL-LABEL: testv32i8:
   1112 ; AVX512VPOPCNTDQVL:       # %bb.0:
   1113 ; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1114 ; AVX512VPOPCNTDQVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1115 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1116 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1117 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1118 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1119 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1120 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1121 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1122 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1123 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1124 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1125 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1126 ; AVX512VPOPCNTDQVL-NEXT:    retq
   1127 ;
   1128 ; BITALG_NOVLX-LABEL: testv32i8:
   1129 ; BITALG_NOVLX:       # %bb.0:
   1130 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1131 ; BITALG_NOVLX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1132 ; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1133 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1134 ; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1135 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
   1136 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
   1137 ; BITALG_NOVLX-NEXT:    retq
   1138 ;
   1139 ; BITALG-LABEL: testv32i8:
   1140 ; BITALG:       # %bb.0:
   1141 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1142 ; BITALG-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1143 ; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1144 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1145 ; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1146 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
   1147 ; BITALG-NEXT:    retq
   1148 ;
   1149 ; X32-AVX-LABEL: testv32i8:
   1150 ; X32-AVX:       # %bb.0:
   1151 ; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1152 ; X32-AVX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1153 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1154 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1155 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1156 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1157 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1158 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1159 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1160 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1161 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1162 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1163 ; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1164 ; X32-AVX-NEXT:    retl
   1165   %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
   1166   ret <32 x i8> %out
   1167 }
   1168 
   1169 define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
        ; Codegen test: llvm.cttz on a <32 x i8> argument with the intrinsic's
        ; second (i1) operand set to -1 (see the LLVM LangRef for that flag).
        ; Expected code, per FileCheck prefix:
        ;  - All prefixes first compute (x & -x) - 1 per byte
        ;    (vpsubb from zero, vpand, vpcmpeqd all-ones, vpaddb).
        ;  - AVX1 does this on two 128-bit halves (vextractf128 / vinsertf128).
        ;  - AVX1, AVX2, AVX512CD*, AVX512VPOPCNTDQ* and X32-AVX then count bits
        ;    with a 16-entry nibble table [0,1,1,2,...,4] looked up via vpshufb
        ;    for the low and high nibbles (mask 15, vpsrlw $4) and summed.
        ;  - BITALG_NOVLX and BITALG use vpopcntb instead (zmm and ymm forms).
   1170 ; AVX1-LABEL: testv32i8u:
   1171 ; AVX1:       # %bb.0:
   1172 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1173 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
   1174 ; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
   1175 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   1176 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
   1177 ; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
   1178 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1179 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
   1180 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1181 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   1182 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   1183 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   1184 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
   1185 ; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
   1186 ; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
   1187 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   1188 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
   1189 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm2
   1190 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
   1191 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1192 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   1193 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
   1194 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
   1195 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1196 ; AVX1-NEXT:    retq
   1197 ;
   1198 ; AVX2-LABEL: testv32i8u:
   1199 ; AVX2:       # %bb.0:
   1200 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1201 ; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1202 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1203 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1204 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1205 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1206 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1207 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1208 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1209 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1210 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1211 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1212 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1213 ; AVX2-NEXT:    retq
   1214 ;
   1215 ; AVX512CDVL-LABEL: testv32i8u:
   1216 ; AVX512CDVL:       # %bb.0:
   1217 ; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1218 ; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1219 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1220 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1221 ; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1222 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1223 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1224 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1225 ; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1226 ; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1227 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1228 ; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1229 ; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1230 ; AVX512CDVL-NEXT:    retq
   1231 ;
   1232 ; AVX512CD-LABEL: testv32i8u:
   1233 ; AVX512CD:       # %bb.0:
   1234 ; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1235 ; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1236 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1237 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1238 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1239 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1240 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1241 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1242 ; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1243 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1244 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1245 ; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1246 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1247 ; AVX512CD-NEXT:    retq
   1248 ;
   1249 ; AVX512VPOPCNTDQ-LABEL: testv32i8u:
   1250 ; AVX512VPOPCNTDQ:       # %bb.0:
   1251 ; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1252 ; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1253 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1254 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1255 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1256 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1257 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1258 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1259 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1260 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1261 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1262 ; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1263 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1264 ; AVX512VPOPCNTDQ-NEXT:    retq
   1265 ;
   1266 ; AVX512VPOPCNTDQVL-LABEL: testv32i8u:
   1267 ; AVX512VPOPCNTDQVL:       # %bb.0:
   1268 ; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1269 ; AVX512VPOPCNTDQVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1270 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1271 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1272 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1273 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1274 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1275 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1276 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1277 ; AVX512VPOPCNTDQVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1278 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1279 ; AVX512VPOPCNTDQVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1280 ; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1281 ; AVX512VPOPCNTDQVL-NEXT:    retq
   1282 ;
   1283 ; BITALG_NOVLX-LABEL: testv32i8u:
   1284 ; BITALG_NOVLX:       # %bb.0:
   1285 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1286 ; BITALG_NOVLX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1287 ; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1288 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1289 ; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1290 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
   1291 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
   1292 ; BITALG_NOVLX-NEXT:    retq
   1293 ;
   1294 ; BITALG-LABEL: testv32i8u:
   1295 ; BITALG:       # %bb.0:
   1296 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1297 ; BITALG-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1298 ; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1299 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1300 ; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1301 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
   1302 ; BITALG-NEXT:    retq
   1303 ;
   1304 ; X32-AVX-LABEL: testv32i8u:
   1305 ; X32-AVX:       # %bb.0:
   1306 ; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1307 ; X32-AVX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
   1308 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1309 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
   1310 ; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
   1311 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1312 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1313 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1314 ; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1315 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1316 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1317 ; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
   1318 ; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
   1319 ; X32-AVX-NEXT:    retl
   1320   %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
   1321   ret <32 x i8> %out
   1322 }
   1323 
   1324 define <4 x i64> @foldv4i64() nounwind {
        ; Constant-folding test: llvm.cttz on the constant vector
        ; <256, -1, 0, 255> with the i1 flag operand set to 0.
        ; Every prefix expects a single constant vector load and a return:
        ; [8,0,64,0], i.e. 8 trailing zeros for 256, 0 for -1 and 255, and the
        ; full width 64 for the zero element.
        ; The 32-bit X32-AVX run expects eight values [8,0,0,0,64,0,0,0], which
        ; is consistent with each i64 result being printed as two 32-bit halves.
   1325 ; AVX-LABEL: foldv4i64:
   1326 ; AVX:       # %bb.0:
   1327 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
   1328 ; AVX-NEXT:    retq
   1329 ;
   1330 ; BITALG_NOVLX-LABEL: foldv4i64:
   1331 ; BITALG_NOVLX:       # %bb.0:
   1332 ; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
   1333 ; BITALG_NOVLX-NEXT:    retq
   1334 ;
   1335 ; BITALG-LABEL: foldv4i64:
   1336 ; BITALG:       # %bb.0:
   1337 ; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
   1338 ; BITALG-NEXT:    retq
   1339 ;
   1340 ; X32-AVX-LABEL: foldv4i64:
   1341 ; X32-AVX:       # %bb.0:
   1342 ; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
   1343 ; X32-AVX-NEXT:    retl
   1344   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
   1345   ret <4 x i64> %out
   1346 }
   1347 
   1348 define <4 x i64> @foldv4i64u() nounwind {
        ; Constant-folding test: same constant input as @foldv4i64,
        ; <256, -1, 0, 255>, but with the i1 flag operand set to -1.
        ; The expected output is the same constant vector [8,0,64,0]
        ; ([8,0,0,0,64,0,0,0] on the 32-bit X32-AVX run); the zero input
        ; element is still expected to produce 64 here.
   1349 ; AVX-LABEL: foldv4i64u:
   1350 ; AVX:       # %bb.0:
   1351 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
   1352 ; AVX-NEXT:    retq
   1353 ;
   1354 ; BITALG_NOVLX-LABEL: foldv4i64u:
   1355 ; BITALG_NOVLX:       # %bb.0:
   1356 ; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
   1357 ; BITALG_NOVLX-NEXT:    retq
   1358 ;
   1359 ; BITALG-LABEL: foldv4i64u:
   1360 ; BITALG:       # %bb.0:
   1361 ; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
   1362 ; BITALG-NEXT:    retq
   1363 ;
   1364 ; X32-AVX-LABEL: foldv4i64u:
   1365 ; X32-AVX:       # %bb.0:
   1366 ; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
   1367 ; X32-AVX-NEXT:    retl
   1368   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
   1369   ret <4 x i64> %out
   1370 }
   1371 
   1372 define <8 x i32> @foldv8i32() nounwind {
        ; Constant-folding test: llvm.cttz on the constant vector
        ; <256, -1, 0, 255, -65536, 7, 24, 88> with the i1 flag operand set to 0.
        ; Every prefix (64-bit and the 32-bit X32-AVX run) expects one constant
        ; vector load of [8,0,32,0,16,0,3,3]: the zero element yields the full
        ; width 32, -65536 yields 16, and 24 and 88 both yield 3.
   1373 ; AVX-LABEL: foldv8i32:
   1374 ; AVX:       # %bb.0:
   1375 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
   1376 ; AVX-NEXT:    retq
   1377 ;
   1378 ; BITALG_NOVLX-LABEL: foldv8i32:
   1379 ; BITALG_NOVLX:       # %bb.0:
   1380 ; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
   1381 ; BITALG_NOVLX-NEXT:    retq
   1382 ;
   1383 ; BITALG-LABEL: foldv8i32:
   1384 ; BITALG:       # %bb.0:
   1385 ; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
   1386 ; BITALG-NEXT:    retq
   1387 ;
   1388 ; X32-AVX-LABEL: foldv8i32:
   1389 ; X32-AVX:       # %bb.0:
   1390 ; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
   1391 ; X32-AVX-NEXT:    retl
   1392   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
   1393   ret <8 x i32> %out
   1394 }
   1395 
   1396 define <8 x i32> @foldv8i32u() nounwind {
        ; Constant-folding test: same constant input as @foldv8i32,
        ; <256, -1, 0, 255, -65536, 7, 24, 88>, but with the i1 flag operand
        ; set to -1.
        ; Every prefix expects the same constant vector [8,0,32,0,16,0,3,3];
        ; the zero input element is still expected to produce 32 here.
   1397 ; AVX-LABEL: foldv8i32u:
   1398 ; AVX:       # %bb.0:
   1399 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
   1400 ; AVX-NEXT:    retq
   1401 ;
   1402 ; BITALG_NOVLX-LABEL: foldv8i32u:
   1403 ; BITALG_NOVLX:       # %bb.0:
   1404 ; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
   1405 ; BITALG_NOVLX-NEXT:    retq
   1406 ;
   1407 ; BITALG-LABEL: foldv8i32u:
   1408 ; BITALG:       # %bb.0:
   1409 ; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
   1410 ; BITALG-NEXT:    retq
   1411 ;
   1412 ; X32-AVX-LABEL: foldv8i32u:
   1413 ; X32-AVX:       # %bb.0:
   1414 ; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
   1415 ; X32-AVX-NEXT:    retl
   1416   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
   1417   ret <8 x i32> %out
   1418 }
   1419 
   1420 define <16 x i16> @foldv16i16() nounwind {
        ; Constant-folding test: llvm.cttz on a 16-element constant i16 vector
        ; with the i1 flag operand set to 0.
        ; Every prefix expects one constant vector load of
        ; [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5].
        ; NOTE(review): the literal `i16 -65536` is wider than 16 bits; the
        ; expected result for that element is 16, the same as for the explicit
        ; zero element, so the checks treat it as a zero value.
   1421 ; AVX-LABEL: foldv16i16:
   1422 ; AVX:       # %bb.0:
   1423 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
   1424 ; AVX-NEXT:    retq
   1425 ;
   1426 ; BITALG_NOVLX-LABEL: foldv16i16:
   1427 ; BITALG_NOVLX:       # %bb.0:
   1428 ; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
   1429 ; BITALG_NOVLX-NEXT:    retq
   1430 ;
   1431 ; BITALG-LABEL: foldv16i16:
   1432 ; BITALG:       # %bb.0:
   1433 ; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
   1434 ; BITALG-NEXT:    retq
   1435 ;
   1436 ; X32-AVX-LABEL: foldv16i16:
   1437 ; X32-AVX:       # %bb.0:
   1438 ; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
   1439 ; X32-AVX-NEXT:    retl
   1440   %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
   1441   ret <16 x i16> %out
   1442 }
   1443 
   1444 define <16 x i16> @foldv16i16u() nounwind {
        ; Constant-folding test: same 16-element constant i16 input as
        ; @foldv16i16, but with the i1 flag operand set to -1.
        ; Every prefix expects the same constant vector
        ; [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]; the zero input element is still
        ; expected to produce 16 here.
        ; NOTE(review): the literal `i16 -65536` is wider than 16 bits; its
        ; expected result is 16, the same as for the explicit zero element.
   1445 ; AVX-LABEL: foldv16i16u:
   1446 ; AVX:       # %bb.0:
   1447 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
   1448 ; AVX-NEXT:    retq
   1449 ;
   1450 ; BITALG_NOVLX-LABEL: foldv16i16u:
   1451 ; BITALG_NOVLX:       # %bb.0:
   1452 ; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
   1453 ; BITALG_NOVLX-NEXT:    retq
   1454 ;
   1455 ; BITALG-LABEL: foldv16i16u:
   1456 ; BITALG:       # %bb.0:
   1457 ; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
   1458 ; BITALG-NEXT:    retq
   1459 ;
   1460 ; X32-AVX-LABEL: foldv16i16u:
   1461 ; X32-AVX:       # %bb.0:
   1462 ; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
   1463 ; X32-AVX-NEXT:    retl
   1464   %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
   1465   ret <16 x i16> %out
   1466 }
   1467 
   1468 define <32 x i8> @foldv32i8() nounwind {
        ; Constant-folding test: llvm.cttz on a 32-element constant i8 vector
        ; with the i1 flag operand set to 0.
        ; Every prefix expects one constant vector load of
        ; [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0].
        ; NOTE(review): the literals `i8 256`, `i8 -65536` and `i8 -256` are
        ; wider than 8 bits; the expected result for each of them is 8, the
        ; same as for the explicit zero element, so the checks treat them as
        ; zero values.
   1469 ; AVX-LABEL: foldv32i8:
   1470 ; AVX:       # %bb.0:
   1471 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
   1472 ; AVX-NEXT:    retq
   1473 ;
   1474 ; BITALG_NOVLX-LABEL: foldv32i8:
   1475 ; BITALG_NOVLX:       # %bb.0:
   1476 ; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
   1477 ; BITALG_NOVLX-NEXT:    retq
   1478 ;
   1479 ; BITALG-LABEL: foldv32i8:
   1480 ; BITALG:       # %bb.0:
   1481 ; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
   1482 ; BITALG-NEXT:    retq
   1483 ;
   1484 ; X32-AVX-LABEL: foldv32i8:
   1485 ; X32-AVX:       # %bb.0:
   1486 ; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
   1487 ; X32-AVX-NEXT:    retl
   1488   %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
   1489   ret <32 x i8> %out
   1490 }
   1491 
   1492 define <32 x i8> @foldv32i8u() nounwind {
        ; Constant-folding test: same 32-element constant i8 input as
        ; @foldv32i8, but with the i1 flag operand set to -1.
        ; Every prefix expects the same constant vector
        ; [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0];
        ; the zero input element is still expected to produce 8 here.
        ; NOTE(review): the literals `i8 256`, `i8 -65536` and `i8 -256` are
        ; wider than 8 bits; each is expected to yield 8, the same as the
        ; explicit zero element.
   1493 ; AVX-LABEL: foldv32i8u:
   1494 ; AVX:       # %bb.0:
   1495 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
   1496 ; AVX-NEXT:    retq
   1497 ;
   1498 ; BITALG_NOVLX-LABEL: foldv32i8u:
   1499 ; BITALG_NOVLX:       # %bb.0:
   1500 ; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
   1501 ; BITALG_NOVLX-NEXT:    retq
   1502 ;
   1503 ; BITALG-LABEL: foldv32i8u:
   1504 ; BITALG:       # %bb.0:
   1505 ; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
   1506 ; BITALG-NEXT:    retq
   1507 ;
   1508 ; X32-AVX-LABEL: foldv32i8u:
   1509 ; X32-AVX:       # %bb.0:
   1510 ; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
   1511 ; X32-AVX-NEXT:    retl
   1512   %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
   1513   ret <32 x i8> %out
   1514 }
   1515 
   1516 declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
   1517 declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
   1518 declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
   1519 declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)
   1520