; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD

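; 256-bit vector CTTZ (count trailing zeros) tests. There is no native vector
; tzcnt instruction, so the generic lowering is expected to isolate the lowest
; set bit with (x & -x), subtract 1 to convert the trailing zeros into a mask
; of ones, and popcount that mask: cttz(x) = popcount((x & -x) - 1). The
; popcount itself uses the nibble-LUT idiom: mask each 4-bit half, look up its
; bit count via vpshufb on the [0,1,1,2,...] table, add the halves, then
; horizontally sum bytes (vpsadbw) where wider element counts are needed.
; Functions with a 'u' suffix pass the 'is zero undef' flag (i1 -1), which
; permits cheaper lowerings on targets with AVX512CD.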
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i64:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX512CD-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

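; Known-non-zero variant: with AVX512CD, cttz can go through vplzcntq using
; the identity cttz(x) = (bitwidth - 1) - lzcnt(x & -x) for x != 0. Without
; AVX512VL the vplzcntq has to execute on the full zmm register.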
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i64u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vplzcntq %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

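; For 32-bit elements, vpsadbw only produces 64-bit sums, so the per-byte
; counts are interleaved with zero (vpunpckhdq/vpunpckldq), summed with
; vpsadbw, and repacked into dwords with vpackuswb.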
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv8i32:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT:    vpandd %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX512CD-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

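; As above, the non-zero-input variant lets AVX512CD use vplzcntd (on zmm
; when AVX512VL is unavailable); AVX1/AVX2 still need the full expansion.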
define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv8i32u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandd %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vplzcntd %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

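; For 16-bit elements the two byte counts within each word are combined by
; shifting the low byte up (vpsllw $8), adding, and shifting the word sum
; back down (vpsrlw $8), avoiding vpsadbw entirely.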
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv16i16:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv16i16:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

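; AVX512CD provides no word-granularity vplzcnt, so even with the 'is zero
; undef' flag the i16 case falls back to the generic popcount expansion.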
define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv16i16u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv16i16u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

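; For byte elements the nibble-LUT popcount directly yields the per-byte
; result, so no widening or horizontal-sum step is needed after vpaddb.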
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv32i8:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv32i8:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

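; Likewise there is no byte-granularity vplzcnt, so the 'u' variant matches
; the generic expansion for all run lines.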
define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv32i8u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv32i8u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

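; The remaining tests verify that cttz of constant vectors is folded at
; compile time, leaving just a load of the result from the constant pool.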
define <4 x i64> @foldv4i64() nounwind {
; AVX1-LABEL: foldv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv4i64:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i64:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; AVX1-LABEL: foldv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv4i64u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i64u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; AVX1-LABEL: foldv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv8i32:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i32:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; AVX1-LABEL: foldv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv8i32u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i32u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; AVX1-LABEL: foldv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv16i16:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i16:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; AVX1-LABEL: foldv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv16i16u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i16u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; AVX1-LABEL: foldv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv32i8:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv32i8:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; AVX1-LABEL: foldv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv32i8u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv32i8u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)