; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
;
; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
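;
; Each element type is exercised twice: once with the llvm.cttz is_zero_undef
; flag set to false (i1 0, so a zero input must produce the element bit
; width) and once with it set to true (the "u" variants, i1 -1, where a zero
; input is undefined and the backend may drop its zero guards).
;
; For reference, a scalar sketch of the popcount-based expansion used by the
; vector lowerings below (the value names are illustrative, not part of this
; test); note that (x & -x) - 1 is all-ones for x == 0, so the popcount also
; yields the bit width in that case:
;   %neg = sub i64 0, %x                       ; -x
;   %lsb = and i64 %x, %neg                    ; isolate the lowest set bit
;   %msk = sub i64 %lsb, 1                     ; mask covering the trailing zeros
;   %res = call i64 @llvm.ctpop.i64(i64 %msk)  ; cttz(x) == popcount(msk)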

define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm0, %rax
; SSE2-NEXT:    bsfq %rax, %rax
; SSE2-NEXT:    movl $64, %ecx
; SSE2-NEXT:    cmoveq %rcx, %rax
; SSE2-NEXT:    movd %rax, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rax
; SSE2-NEXT:    bsfq %rax, %rax
; SSE2-NEXT:    cmoveq %rcx, %rax
; SSE2-NEXT:    movd %rax, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv2i64:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %rax
; SSE3-NEXT:    bsfq %rax, %rax
; SSE3-NEXT:    movl $64, %ecx
; SSE3-NEXT:    cmoveq %rcx, %rax
; SSE3-NEXT:    movd %rax, %xmm1
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %rax
; SSE3-NEXT:    bsfq %rax, %rax
; SSE3-NEXT:    cmoveq %rcx, %rax
; SSE3-NEXT:    movd %rax, %xmm0
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd %xmm0, %rax
; SSSE3-NEXT:    bsfq %rax, %rax
; SSSE3-NEXT:    movl $64, %ecx
; SSSE3-NEXT:    cmoveq %rcx, %rax
; SSSE3-NEXT:    movd %rax, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSSE3-NEXT:    movd %xmm0, %rax
; SSSE3-NEXT:    bsfq %rax, %rax
; SSSE3-NEXT:    cmoveq %rcx, %rax
; SSSE3-NEXT:    movd %rax, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    bsfq %rax, %rax
; SSE41-NEXT:    movl $64, %ecx
; SSE41-NEXT:    cmoveq %rcx, %rax
; SSE41-NEXT:    movd %rax, %xmm1
; SSE41-NEXT:    movd %xmm0, %rax
; SSE41-NEXT:    bsfq %rax, %rax
; SSE41-NEXT:    cmoveq %rcx, %rax
; SSE41-NEXT:    movd %rax, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    bsfq %rax, %rax
; AVX-NEXT:    movl $64, %ecx
; AVX-NEXT:    cmoveq %rcx, %rax
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    bsfq %rax, %rax
; AVX-NEXT:    cmoveq %rcx, %rax
; AVX-NEXT:    vmovq %rax, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: testv2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pushl %esi
; X32-SSE-NEXT:    pextrd $3, %xmm0, %eax
; X32-SSE-NEXT:    bsfl %eax, %eax
; X32-SSE-NEXT:    movl $32, %ecx
; X32-SSE-NEXT:    cmovel %ecx, %eax
; X32-SSE-NEXT:    addl $32, %eax
; X32-SSE-NEXT:    pextrd $2, %xmm0, %edx
; X32-SSE-NEXT:    bsfl %edx, %esi
; X32-SSE-NEXT:    testl %edx, %edx
; X32-SSE-NEXT:    cmovel %eax, %esi
; X32-SSE-NEXT:    movd %esi, %xmm1
; X32-SSE-NEXT:    pextrd $1, %xmm0, %eax
; X32-SSE-NEXT:    bsfl %eax, %eax
; X32-SSE-NEXT:    cmovel %ecx, %eax
; X32-SSE-NEXT:    addl $32, %eax
; X32-SSE-NEXT:    movd %xmm0, %ecx
; X32-SSE-NEXT:    bsfl %ecx, %edx
; X32-SSE-NEXT:    testl %ecx, %ecx
; X32-SSE-NEXT:    cmovel %eax, %edx
; X32-SSE-NEXT:    movd %edx, %xmm0
; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT:    popl %esi
; X32-SSE-NEXT:    retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
  ret <2 x i64> %out
}

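; The "u" variants call llvm.cttz with is_zero_undef set (i1 -1): the cmov
; guards against a zero input seen above are no longer needed, bsf's result
; is used directly, and AVX512CD can compute cttz(x) as 63 - lzcnt(x & -x)
; with vplzcntq (widening to zmm when AVX512VL is unavailable).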
define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64u:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm0, %rax
; SSE2-NEXT:    bsfq %rax, %rax
; SSE2-NEXT:    movd %rax, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    movd %xmm0, %rax
; SSE2-NEXT:    bsfq %rax, %rax
; SSE2-NEXT:    movd %rax, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv2i64u:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %rax
; SSE3-NEXT:    bsfq %rax, %rax
; SSE3-NEXT:    movd %rax, %xmm1
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %rax
; SSE3-NEXT:    bsfq %rax, %rax
; SSE3-NEXT:    movd %rax, %xmm0
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv2i64u:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd %xmm0, %rax
; SSSE3-NEXT:    bsfq %rax, %rax
; SSSE3-NEXT:    movd %rax, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSSE3-NEXT:    movd %xmm0, %rax
; SSSE3-NEXT:    bsfq %rax, %rax
; SSSE3-NEXT:    movd %rax, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv2i64u:
; SSE41:       # BB#0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    bsfq %rax, %rax
; SSE41-NEXT:    movd %rax, %xmm1
; SSE41-NEXT:    movd %xmm0, %rax
; SSE41-NEXT:    bsfq %rax, %rax
; SSE41-NEXT:    movd %rax, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv2i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
; AVX1-NEXT:    bsfq %rax, %rax
; AVX1-NEXT:    vmovq %rax, %xmm1
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    bsfq %rax, %rax
; AVX1-NEXT:    vmovq %rax, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv2i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; AVX2-NEXT:    bsfq %rax, %rax
; AVX2-NEXT:    vmovq %rax, %xmm1
; AVX2-NEXT:    vmovq %xmm0, %rax
; AVX2-NEXT:    bsfq %rax, %rax
; AVX2-NEXT:    vmovq %rax, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv2i64u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [63,63]
; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv2i64u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: testv2i64u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pextrd $2, %xmm0, %eax
; X32-SSE-NEXT:    bsfl %eax, %ecx
; X32-SSE-NEXT:    pextrd $3, %xmm0, %edx
; X32-SSE-NEXT:    bsfl %edx, %edx
; X32-SSE-NEXT:    addl $32, %edx
; X32-SSE-NEXT:    testl %eax, %eax
; X32-SSE-NEXT:    cmovnel %ecx, %edx
; X32-SSE-NEXT:    movd %edx, %xmm1
; X32-SSE-NEXT:    movd %xmm0, %eax
; X32-SSE-NEXT:    bsfl %eax, %ecx
; X32-SSE-NEXT:    pextrd $1, %xmm0, %edx
; X32-SSE-NEXT:    bsfl %edx, %edx
; X32-SSE-NEXT:    addl $32, %edx
; X32-SSE-NEXT:    testl %eax, %eax
; X32-SSE-NEXT:    cmovnel %ecx, %edx
; X32-SSE-NEXT:    movd %edx, %xmm0
; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT:    retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
  ret <2 x i64> %out
}

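; The i32 lowering uses the popcount expansion sketched above: negate, mask
; off the lowest set bit, subtract 1, then population count. SSE2 popcounts
; with shift-and-mask arithmetic; SSSE3 and later use a pshufb nibble lookup
; table. The per-byte counts are then summed into 32-bit lanes by unpacking
; against zero (punpckhdq/punpckldq), psadbw, and packuswb.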
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    psrld $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm0, %xmm3
; SSE2-NEXT:    psrld $2, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psadbw %xmm1, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv4i32:
; SSE3:       # BB#0:
; SSE3-NEXT:    pxor %xmm1, %xmm1
; SSE3-NEXT:    pxor %xmm2, %xmm2
; SSE3-NEXT:    psubd %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm0, %xmm2
; SSE3-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSE3-NEXT:    movdqa %xmm2, %xmm0
; SSE3-NEXT:    psrld $1, %xmm0
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE3-NEXT:    psubd %xmm0, %xmm2
; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT:    movdqa %xmm2, %xmm3
; SSE3-NEXT:    pand %xmm0, %xmm3
; SSE3-NEXT:    psrld $2, %xmm2
; SSE3-NEXT:    pand %xmm0, %xmm2
; SSE3-NEXT:    paddd %xmm3, %xmm2
; SSE3-NEXT:    movdqa %xmm2, %xmm0
; SSE3-NEXT:    psrld $4, %xmm0
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT:    psadbw %xmm1, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    psadbw %xmm1, %xmm0
; SSE3-NEXT:    packuswb %xmm2, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    psubd %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm0, %xmm2
; SSSE3-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm4
; SSSE3-NEXT:    pand %xmm3, %xmm4
; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm0, %xmm5
; SSSE3-NEXT:    pshufb %xmm4, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm3, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    paddb %xmm5, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT:    psadbw %xmm1, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    psadbw %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    psubd %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm0, %xmm2
; SSE41-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pand %xmm3, %xmm4
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    pshufb %xmm4, %xmm5
; SSE41-NEXT:    psrlw $4, %xmm2
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    paddb %xmm5, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE41-NEXT:    psadbw %xmm1, %xmm2
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i32:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
; AVX512CDVL-NEXT:    vpandd %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpsubd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %xmm2, %xmm0, %xmm3
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpandq %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
; AVX512CDVL-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i32:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX512CD-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
; AVX512CD-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX512CD-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: testv4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm2, %xmm2
; X32-SSE-NEXT:    psubd %xmm0, %xmm2
; X32-SSE-NEXT:    pand %xmm0, %xmm2
; X32-SSE-NEXT:    psubd {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
; X32-SSE-NEXT:    pand %xmm3, %xmm4
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
; X32-SSE-NEXT:    psrlw $4, %xmm2
; X32-SSE-NEXT:    pand %xmm3, %xmm2
; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
; X32-SSE-NEXT:    paddb %xmm5, %xmm0
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-SSE-NEXT:    psadbw %xmm1, %xmm2
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
  ret <4 x i32> %out
}

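; With a nonzero guarantee, AVX512CD(VL) lowers i32 cttz via vplzcntd:
; cttz(x) = 31 - lzcnt(x & -x), with the splat of 31 loaded via broadcast.
; Without AVX512VL the xmm operand is processed in a zmm register. All other
; subtargets reuse the popcount expansion from testv4i32.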
define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32u:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    psrld $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psubd %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm0, %xmm3
; SSE2-NEXT:    psrld $2, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    psadbw %xmm1, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    psadbw %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv4i32u:
; SSE3:       # BB#0:
; SSE3-NEXT:    pxor %xmm1, %xmm1
; SSE3-NEXT:    pxor %xmm2, %xmm2
; SSE3-NEXT:    psubd %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm0, %xmm2
; SSE3-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSE3-NEXT:    movdqa %xmm2, %xmm0
; SSE3-NEXT:    psrld $1, %xmm0
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE3-NEXT:    psubd %xmm0, %xmm2
; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT:    movdqa %xmm2, %xmm3
; SSE3-NEXT:    pand %xmm0, %xmm3
; SSE3-NEXT:    psrld $2, %xmm2
; SSE3-NEXT:    pand %xmm0, %xmm2
; SSE3-NEXT:    paddd %xmm3, %xmm2
; SSE3-NEXT:    movdqa %xmm2, %xmm0
; SSE3-NEXT:    psrld $4, %xmm0
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT:    psadbw %xmm1, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    psadbw %xmm1, %xmm0
; SSE3-NEXT:    packuswb %xmm2, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv4i32u:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    psubd %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm0, %xmm2
; SSSE3-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm4
; SSSE3-NEXT:    pand %xmm3, %xmm4
; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm0, %xmm5
; SSSE3-NEXT:    pshufb %xmm4, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm3, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    paddb %xmm5, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT:    psadbw %xmm1, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    psadbw %xmm1, %xmm0
; SSSE3-NEXT:    packuswb %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv4i32u:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    psubd %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm0, %xmm2
; SSE41-NEXT:    psubd {{.*}}(%rip), %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pand %xmm3, %xmm4
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    pshufb %xmm4, %xmm5
; SSE41-NEXT:    psrlw $4, %xmm2
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    paddb %xmm5, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE41-NEXT:    psadbw %xmm1, %xmm2
; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv4i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i32u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpandd %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i32u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: testv4i32u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm2, %xmm2
; X32-SSE-NEXT:    psubd %xmm0, %xmm2
; X32-SSE-NEXT:    pand %xmm0, %xmm2
; X32-SSE-NEXT:    psubd {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
; X32-SSE-NEXT:    pand %xmm3, %xmm4
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
; X32-SSE-NEXT:    psrlw $4, %xmm2
; X32-SSE-NEXT:    pand %xmm3, %xmm2
; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
; X32-SSE-NEXT:    paddb %xmm5, %xmm0
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-SSE-NEXT:    psadbw %xmm1, %xmm2
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
  ret <4 x i32> %out
}

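; The i16 version follows the same popcount expansion; the per-byte counts
; are folded into 16-bit lanes by adding the low byte into the high byte
; (psllw $8 + paddb) and shifting the sum back down (psrlw $8).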
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddw %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    paddw %xmm1, %xmm2
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv8i16:
; SSE3:       # BB#0:
; SSE3-NEXT:    pxor %xmm1, %xmm1
; SSE3-NEXT:    psubw %xmm0, %xmm1
; SSE3-NEXT:    pand %xmm0, %xmm1
; SSE3-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    psrlw $1, %xmm0
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE3-NEXT:    psubw %xmm0, %xmm1
; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    pand %xmm0, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm1
; SSE3-NEXT:    pand %xmm0, %xmm1
; SSE3-NEXT:    paddw %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    psrlw $4, %xmm2
; SSE3-NEXT:    paddw %xmm1, %xmm2
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE3-NEXT:    movdqa %xmm2, %xmm0
; SSE3-NEXT:    psllw $8, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    psrlw $8, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    psubw %xmm0, %xmm1
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    pand %xmm0, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    psllw $8, %xmm0
; SSSE3-NEXT:    paddb %xmm3, %xmm0
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psubw %xmm0, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm0, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    paddb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv8i16:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX512CDVL-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv8i16:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX512CD-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: testv8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    psubw %xmm0, %xmm1
; X32-SSE-NEXT:    pand %xmm0, %xmm1
; X32-SSE-NEXT:    psubw {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pand %xmm0, %xmm2
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm1
; X32-SSE-NEXT:    pand %xmm0, %xmm1
; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
; X32-SSE-NEXT:    paddb %xmm4, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm0
; X32-SSE-NEXT:    psllw $8, %xmm0
; X32-SSE-NEXT:    paddb %xmm3, %xmm0
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0)
  ret <8 x i16> %out
}

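; vplzcnt only exists for dword/qword elements, so even with is_zero_undef
; the AVX512CD targets lower i16 cttz with the same popcount expansion as
; testv8i16.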
define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16u:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddw %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    paddw %xmm1, %xmm2
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv8i16u:
; SSE3:       # BB#0:
; SSE3-NEXT:    pxor %xmm1, %xmm1
; SSE3-NEXT:    psubw %xmm0, %xmm1
; SSE3-NEXT:    pand %xmm0, %xmm1
; SSE3-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    psrlw $1, %xmm0
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE3-NEXT:    psubw %xmm0, %xmm1
; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    pand %xmm0, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm1
; SSE3-NEXT:    pand %xmm0, %xmm1
; SSE3-NEXT:    paddw %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    psrlw $4, %xmm2
; SSE3-NEXT:    paddw %xmm1, %xmm2
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE3-NEXT:    movdqa %xmm2, %xmm0
; SSE3-NEXT:    psllw $8, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    psrlw $8, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv8i16u:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    psubw %xmm0, %xmm1
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    pand %xmm0, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    psllw $8, %xmm0
; SSSE3-NEXT:    paddb %xmm3, %xmm0
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv8i16u:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psubw %xmm0, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    psubw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    pand %xmm0, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    paddb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv8i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv8i16u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX512CDVL-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512CDVL-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv8i16u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX512CD-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512CD-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: testv8i16u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    psubw %xmm0, %xmm1
; X32-SSE-NEXT:    pand %xmm0, %xmm1
; X32-SSE-NEXT:    psubw {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pand %xmm0, %xmm2
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm1
; X32-SSE-NEXT:    pand %xmm0, %xmm1
; X32-SSE-NEXT:    pshufb %xmm1, %xmm3
; X32-SSE-NEXT:    paddb %xmm4, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm0
; X32-SSE-NEXT:    psllw $8, %xmm0
; X32-SSE-NEXT:    paddb %xmm3, %xmm0
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1)
  ret <8 x i16> %out
}

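; For i8 the pshufb nibble lookups already yield a per-byte count, so the
; two table results are simply summed with paddb; no lane widening or
; horizontal reduction is needed.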
   1056 define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
   1057 ; SSE2-LABEL: testv16i8:
   1058 ; SSE2:       # BB#0:
   1059 ; SSE2-NEXT:    pxor %xmm1, %xmm1
   1060 ; SSE2-NEXT:    psubb %xmm0, %xmm1
   1061 ; SSE2-NEXT:    pand %xmm0, %xmm1
   1062 ; SSE2-NEXT:    psubb {{.*}}(%rip), %xmm1
   1063 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1064 ; SSE2-NEXT:    psrlw $1, %xmm0
   1065 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1066 ; SSE2-NEXT:    psubb %xmm0, %xmm1
   1067 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
   1068 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   1069 ; SSE2-NEXT:    pand %xmm0, %xmm2
   1070 ; SSE2-NEXT:    psrlw $2, %xmm1
   1071 ; SSE2-NEXT:    pand %xmm0, %xmm1
   1072 ; SSE2-NEXT:    paddb %xmm2, %xmm1
   1073 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1074 ; SSE2-NEXT:    psrlw $4, %xmm0
   1075 ; SSE2-NEXT:    paddb %xmm1, %xmm0
   1076 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1077 ; SSE2-NEXT:    retq
   1078 ;
   1079 ; SSE3-LABEL: testv16i8:
   1080 ; SSE3:       # BB#0:
   1081 ; SSE3-NEXT:    pxor %xmm1, %xmm1
   1082 ; SSE3-NEXT:    psubb %xmm0, %xmm1
   1083 ; SSE3-NEXT:    pand %xmm0, %xmm1
   1084 ; SSE3-NEXT:    psubb {{.*}}(%rip), %xmm1
   1085 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
   1086 ; SSE3-NEXT:    psrlw $1, %xmm0
   1087 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
   1088 ; SSE3-NEXT:    psubb %xmm0, %xmm1
   1089 ; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
   1090 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
   1091 ; SSE3-NEXT:    pand %xmm0, %xmm2
   1092 ; SSE3-NEXT:    psrlw $2, %xmm1
   1093 ; SSE3-NEXT:    pand %xmm0, %xmm1
   1094 ; SSE3-NEXT:    paddb %xmm2, %xmm1
   1095 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
   1096 ; SSE3-NEXT:    psrlw $4, %xmm0
   1097 ; SSE3-NEXT:    paddb %xmm1, %xmm0
   1098 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
   1099 ; SSE3-NEXT:    retq
   1100 ;
   1101 ; SSSE3-LABEL: testv16i8:
   1102 ; SSSE3:       # BB#0:
   1103 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
   1104 ; SSSE3-NEXT:    psubb %xmm0, %xmm1
   1105 ; SSSE3-NEXT:    pand %xmm0, %xmm1
   1106 ; SSSE3-NEXT:    psubb {{.*}}(%rip), %xmm1
   1107 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1108 ; SSSE3-NEXT:    movdqa %xmm1, %xmm3
   1109 ; SSSE3-NEXT:    pand %xmm2, %xmm3
   1110 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
   1111 ; SSSE3-NEXT:    movdqa %xmm0, %xmm4
   1112 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
   1113 ; SSSE3-NEXT:    psrlw $4, %xmm1
   1114 ; SSSE3-NEXT:    pand %xmm2, %xmm1
   1115 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
   1116 ; SSSE3-NEXT:    paddb %xmm4, %xmm0
   1117 ; SSSE3-NEXT:    retq
   1118 ;
   1119 ; SSE41-LABEL: testv16i8:
   1120 ; SSE41:       # BB#0:
   1121 ; SSE41-NEXT:    pxor %xmm1, %xmm1
   1122 ; SSE41-NEXT:    psubb %xmm0, %xmm1
   1123 ; SSE41-NEXT:    pand %xmm0, %xmm1
   1124 ; SSE41-NEXT:    psubb {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    pshufb %xmm3, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm1
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    paddb %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv16i8:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv16i8:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: testv16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    psubb %xmm0, %xmm1
; X32-SSE-NEXT:    pand %xmm0, %xmm1
; X32-SSE-NEXT:    psubb {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
; X32-SSE-NEXT:    pand %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm1
; X32-SSE-NEXT:    pand %xmm2, %xmm1
; X32-SSE-NEXT:    pshufb %xmm1, %xmm0
; X32-SSE-NEXT:    paddb %xmm4, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
  ret <16 x i8> %out
}
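; Without a native per-element tzcnt, the lowering above uses the identity
; cttz(x) = popcount((x & -x) - 1): pxor/psubb computes -x, pand isolates the
; lowest set bit, and the psubb of a constant-pool all-ones vector turns it
; into a mask of cttz(x) trailing one bits. The pshufb-capable runs then
; popcount that mask per byte with a nibble lookup table ([0,1,1,2,...,4]
; indexed by the low and high nibbles, summed with paddb). The AVX512CDVL run
; emits the same algorithm through EVEX-encoded forms (vpxord, vpandq,
; vmovdqa64). A scalar C sketch of the same identity, for illustration only
; (the cttz8/NIBBLE_POP names are ours, not part of this test):
;
;   #include <stdint.h>
;   static const uint8_t NIBBLE_POP[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
;   uint8_t cttz8(uint8_t x) {            /* defined result: cttz8(0) == 8 */
;     uint8_t m = (uint8_t)((x & (uint8_t)-x) - 1); /* cttz(x) trailing 1s */
;     return NIBBLE_POP[m & 0xF] + NIBBLE_POP[m >> 4];   /* popcount(m)   */
;   }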

define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8u:
; SSE2:       # BB#0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psubb %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psubb {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    psubb %xmm0, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv16i8u:
; SSE3:       # BB#0:
; SSE3-NEXT:    pxor %xmm1, %xmm1
; SSE3-NEXT:    psubb %xmm0, %xmm1
; SSE3-NEXT:    pand %xmm0, %xmm1
; SSE3-NEXT:    psubb {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    psrlw $1, %xmm0
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE3-NEXT:    psubb %xmm0, %xmm1
; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    pand %xmm0, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm1
; SSE3-NEXT:    pand %xmm0, %xmm1
; SSE3-NEXT:    paddb %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    psrlw $4, %xmm0
; SSE3-NEXT:    paddb %xmm1, %xmm0
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv16i8u:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    psubb %xmm0, %xmm1
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    psubb {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    paddb %xmm4, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv16i8u:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psubb %xmm0, %xmm1
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    psubb {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    pshufb %xmm3, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm1
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    paddb %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv16i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv16i8u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpandq %xmm1, %xmm0, %xmm0
; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX512CDVL-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv16i8u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512CD-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX512CD-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: testv16i8u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    psubb %xmm0, %xmm1
; X32-SSE-NEXT:    pand %xmm0, %xmm1
; X32-SSE-NEXT:    psubb {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
; X32-SSE-NEXT:    pand %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm1
; X32-SSE-NEXT:    pand %xmm2, %xmm1
; X32-SSE-NEXT:    pshufb %xmm1, %xmm0
; X32-SSE-NEXT:    paddb %xmm4, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
  ret <16 x i8> %out
}
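; The *u variant passes i1 -1 (result undefined for zero inputs), but for i8
; elements the generated code is essentially unchanged: there is no per-byte
; tzcnt to exploit, so the same popcount((x & -x) - 1) expansion applies. On
; SSE2/SSE3, which lack pshufb, the per-byte popcount is instead the classic
; shift-and-mask (SWAR) reduction: the visible [51,...] (0x33) constant and
; the 0x55/0x0F masks hidden behind the {{.*}}(%rip) regexes sum bits in
; pairs, then nibbles, then bytes. A C sketch of that reduction, for
; illustration only (the popcount8 name is ours):
;
;   #include <stdint.h>
;   uint8_t popcount8(uint8_t m) {
;     m = m - ((m >> 1) & 0x55);          /* 2-bit sums (psrlw/pand/psubb) */
;     m = (m & 0x33) + ((m >> 2) & 0x33); /* 4-bit sums (pand/psrlw/paddb) */
;     return (m + (m >> 4)) & 0x0F;       /* byte sum   (psrlw/paddb/pand) */
;   }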
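; The remaining tests exercise constant folding: cttz of a constant vector is
; evaluated at compile time, so each function should reduce to materializing
; the precomputed result (an immediate move or a constant-pool load) rather
; than any of the expansions above.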
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE:       # BB#0:
; SSE-NEXT:    movl $8, %eax
; SSE-NEXT:    movd %rax, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv2i64:
; AVX:       # BB#0:
; AVX-NEXT:    movl $8, %eax
; AVX-NEXT:    vmovq %rax, %xmm0
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: foldv2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl $8, %eax
; X32-SSE-NEXT:    movd %eax, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
  ret <2 x i64> %out
}
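; The folded value here is <i64 8, i64 0> (cttz(256) == 8, cttz(-1) == 0), so
; only an 8 in the low element is needed. On x86-64 that is movl $8 plus a
; 64-bit movd/vmovq; on 32-bit x86 a plain movd %eax, %xmm0 already
; zero-extends into the full register, yielding the same <8, 0> without
; touching the upper i64 lane.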

define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
; SSE:       # BB#0:
; SSE-NEXT:    movl $8, %eax
; SSE-NEXT:    movd %rax, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv2i64u:
; AVX:       # BB#0:
; AVX-NEXT:    movl $8, %eax
; AVX-NEXT:    vmovq %rax, %xmm0
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: foldv2i64u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl $8, %eax
; X32-SSE-NEXT:    movd %eax, %xmm0
; X32-SSE-NEXT:    retl
  %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
  ret <2 x i64> %out
}

define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: foldv4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv4i32:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i32:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: foldv4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
  ret <4 x i32> %out
}

define <4 x i32> @foldv4i32u() nounwind {
; SSE-LABEL: foldv4i32u:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: foldv4i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv4i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv4i32u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i32u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: foldv4i32u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
; X32-SSE-NEXT:    retl
  %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
  ret <4 x i32> %out
}

define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: foldv8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv8i16:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i16:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: foldv8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; X32-SSE-NEXT:    retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
  ret <8 x i16> %out
}
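; Worked example for the v8i16 fold: the inputs <256, -1, 0, 255, -65536, 7,
; 24, 88> truncate to i16 as <256, 0xFFFF, 0, 255, 0, 7, 24, 88>, giving
; cttz results <8, 0, 16, 0, 16, 0, 3, 3> (cttz(0) == 16 because i1 0
; requests a defined result; 24 = 0b11000 and 88 = 0b1011000 both end in
; three zero bits), which matches the [8,0,16,0,16,0,3,3] constant checked
; above.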

define <8 x i16> @foldv8i16u() nounwind {
; SSE-LABEL: foldv8i16u:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: foldv8i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv8i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv8i16u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i16u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: foldv8i16u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; X32-SSE-NEXT:    retl
  %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
  ret <8 x i16> %out
}

define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX1-LABEL: foldv16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv16i8:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i8:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: foldv16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; X32-SSE-NEXT:    retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
  ret <16 x i8> %out
}

define <16 x i8> @foldv16i8u() nounwind {
; SSE-LABEL: foldv16i8u:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX1-LABEL: foldv16i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv16i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv16i8u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i8u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT:    retq
;
; X32-SSE-LABEL: foldv16i8u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; X32-SSE-NEXT:    retl
  %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
  ret <16 x i8> %out
}

declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1)
declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1)
declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1)