1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL 9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD 10 ; 11 ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. 
12 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 13 14 define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { 15 ; SSE2-LABEL: testv2i64: 16 ; SSE2: # BB#0: 17 ; SSE2-NEXT: movd %xmm0, %rax 18 ; SSE2-NEXT: bsfq %rax, %rax 19 ; SSE2-NEXT: movl $64, %ecx 20 ; SSE2-NEXT: cmoveq %rcx, %rax 21 ; SSE2-NEXT: movd %rax, %xmm1 22 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 23 ; SSE2-NEXT: movd %xmm0, %rax 24 ; SSE2-NEXT: bsfq %rax, %rax 25 ; SSE2-NEXT: cmoveq %rcx, %rax 26 ; SSE2-NEXT: movd %rax, %xmm0 27 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 28 ; SSE2-NEXT: movdqa %xmm1, %xmm0 29 ; SSE2-NEXT: retq 30 ; 31 ; SSE3-LABEL: testv2i64: 32 ; SSE3: # BB#0: 33 ; SSE3-NEXT: movd %xmm0, %rax 34 ; SSE3-NEXT: bsfq %rax, %rax 35 ; SSE3-NEXT: movl $64, %ecx 36 ; SSE3-NEXT: cmoveq %rcx, %rax 37 ; SSE3-NEXT: movd %rax, %xmm1 38 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 39 ; SSE3-NEXT: movd %xmm0, %rax 40 ; SSE3-NEXT: bsfq %rax, %rax 41 ; SSE3-NEXT: cmoveq %rcx, %rax 42 ; SSE3-NEXT: movd %rax, %xmm0 43 ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 44 ; SSE3-NEXT: movdqa %xmm1, %xmm0 45 ; SSE3-NEXT: retq 46 ; 47 ; SSSE3-LABEL: testv2i64: 48 ; SSSE3: # BB#0: 49 ; SSSE3-NEXT: movd %xmm0, %rax 50 ; SSSE3-NEXT: bsfq %rax, %rax 51 ; SSSE3-NEXT: movl $64, %ecx 52 ; SSSE3-NEXT: cmoveq %rcx, %rax 53 ; SSSE3-NEXT: movd %rax, %xmm1 54 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 55 ; SSSE3-NEXT: movd %xmm0, %rax 56 ; SSSE3-NEXT: bsfq %rax, %rax 57 ; SSSE3-NEXT: cmoveq %rcx, %rax 58 ; SSSE3-NEXT: movd %rax, %xmm0 59 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 60 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 61 ; SSSE3-NEXT: retq 62 ; 63 ; SSE41-LABEL: testv2i64: 64 ; SSE41: # BB#0: 65 ; SSE41-NEXT: pextrq $1, %xmm0, %rax 66 ; SSE41-NEXT: bsfq %rax, %rax 67 ; SSE41-NEXT: movl $64, %ecx 68 ; SSE41-NEXT: cmoveq %rcx, %rax 69 ; SSE41-NEXT: movd %rax, %xmm1 70 ; 
SSE41-NEXT: movd %xmm0, %rax 71 ; SSE41-NEXT: bsfq %rax, %rax 72 ; SSE41-NEXT: cmoveq %rcx, %rax 73 ; SSE41-NEXT: movd %rax, %xmm0 74 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 75 ; SSE41-NEXT: retq 76 ; 77 ; AVX-LABEL: testv2i64: 78 ; AVX: # BB#0: 79 ; AVX-NEXT: vpextrq $1, %xmm0, %rax 80 ; AVX-NEXT: bsfq %rax, %rax 81 ; AVX-NEXT: movl $64, %ecx 82 ; AVX-NEXT: cmoveq %rcx, %rax 83 ; AVX-NEXT: vmovq %rax, %xmm1 84 ; AVX-NEXT: vmovq %xmm0, %rax 85 ; AVX-NEXT: bsfq %rax, %rax 86 ; AVX-NEXT: cmoveq %rcx, %rax 87 ; AVX-NEXT: vmovq %rax, %xmm0 88 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 89 ; AVX-NEXT: retq 90 ; 91 ; X32-SSE-LABEL: testv2i64: 92 ; X32-SSE: # BB#0: 93 ; X32-SSE-NEXT: pushl %esi 94 ; X32-SSE-NEXT: pextrd $3, %xmm0, %eax 95 ; X32-SSE-NEXT: bsfl %eax, %eax 96 ; X32-SSE-NEXT: movl $32, %ecx 97 ; X32-SSE-NEXT: cmovel %ecx, %eax 98 ; X32-SSE-NEXT: addl $32, %eax 99 ; X32-SSE-NEXT: pextrd $2, %xmm0, %edx 100 ; X32-SSE-NEXT: bsfl %edx, %esi 101 ; X32-SSE-NEXT: testl %edx, %edx 102 ; X32-SSE-NEXT: cmovel %eax, %esi 103 ; X32-SSE-NEXT: movd %esi, %xmm1 104 ; X32-SSE-NEXT: pextrd $1, %xmm0, %eax 105 ; X32-SSE-NEXT: bsfl %eax, %eax 106 ; X32-SSE-NEXT: cmovel %ecx, %eax 107 ; X32-SSE-NEXT: addl $32, %eax 108 ; X32-SSE-NEXT: movd %xmm0, %ecx 109 ; X32-SSE-NEXT: bsfl %ecx, %edx 110 ; X32-SSE-NEXT: testl %ecx, %ecx 111 ; X32-SSE-NEXT: cmovel %eax, %edx 112 ; X32-SSE-NEXT: movd %edx, %xmm0 113 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 114 ; X32-SSE-NEXT: popl %esi 115 ; X32-SSE-NEXT: retl 116 %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0) 117 ret <2 x i64> %out 118 } 119 120 define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { 121 ; SSE2-LABEL: testv2i64u: 122 ; SSE2: # BB#0: 123 ; SSE2-NEXT: movd %xmm0, %rax 124 ; SSE2-NEXT: bsfq %rax, %rax 125 ; SSE2-NEXT: movd %rax, %xmm1 126 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 127 ; SSE2-NEXT: movd %xmm0, %rax 128 ; SSE2-NEXT: bsfq %rax, %rax 129 ; SSE2-NEXT: 
movd %rax, %xmm0 130 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 131 ; SSE2-NEXT: movdqa %xmm1, %xmm0 132 ; SSE2-NEXT: retq 133 ; 134 ; SSE3-LABEL: testv2i64u: 135 ; SSE3: # BB#0: 136 ; SSE3-NEXT: movd %xmm0, %rax 137 ; SSE3-NEXT: bsfq %rax, %rax 138 ; SSE3-NEXT: movd %rax, %xmm1 139 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 140 ; SSE3-NEXT: movd %xmm0, %rax 141 ; SSE3-NEXT: bsfq %rax, %rax 142 ; SSE3-NEXT: movd %rax, %xmm0 143 ; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 144 ; SSE3-NEXT: movdqa %xmm1, %xmm0 145 ; SSE3-NEXT: retq 146 ; 147 ; SSSE3-LABEL: testv2i64u: 148 ; SSSE3: # BB#0: 149 ; SSSE3-NEXT: movd %xmm0, %rax 150 ; SSSE3-NEXT: bsfq %rax, %rax 151 ; SSSE3-NEXT: movd %rax, %xmm1 152 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 153 ; SSSE3-NEXT: movd %xmm0, %rax 154 ; SSSE3-NEXT: bsfq %rax, %rax 155 ; SSSE3-NEXT: movd %rax, %xmm0 156 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 157 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 158 ; SSSE3-NEXT: retq 159 ; 160 ; SSE41-LABEL: testv2i64u: 161 ; SSE41: # BB#0: 162 ; SSE41-NEXT: pextrq $1, %xmm0, %rax 163 ; SSE41-NEXT: bsfq %rax, %rax 164 ; SSE41-NEXT: movd %rax, %xmm1 165 ; SSE41-NEXT: movd %xmm0, %rax 166 ; SSE41-NEXT: bsfq %rax, %rax 167 ; SSE41-NEXT: movd %rax, %xmm0 168 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 169 ; SSE41-NEXT: retq 170 ; 171 ; AVX1-LABEL: testv2i64u: 172 ; AVX1: # BB#0: 173 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax 174 ; AVX1-NEXT: bsfq %rax, %rax 175 ; AVX1-NEXT: vmovq %rax, %xmm1 176 ; AVX1-NEXT: vmovq %xmm0, %rax 177 ; AVX1-NEXT: bsfq %rax, %rax 178 ; AVX1-NEXT: vmovq %rax, %xmm0 179 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 180 ; AVX1-NEXT: retq 181 ; 182 ; AVX2-LABEL: testv2i64u: 183 ; AVX2: # BB#0: 184 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax 185 ; AVX2-NEXT: bsfq %rax, %rax 186 ; AVX2-NEXT: vmovq %rax, %xmm1 187 ; AVX2-NEXT: vmovq %xmm0, %rax 188 ; AVX2-NEXT: bsfq %rax, %rax 189 ; AVX2-NEXT: vmovq %rax, %xmm0 190 ; 
AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 191 ; AVX2-NEXT: retq 192 ; 193 ; AVX512CDVL-LABEL: testv2i64u: 194 ; AVX512CDVL: # BB#0: 195 ; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1 196 ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1 197 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0 198 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 199 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [63,63] 200 ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 201 ; AVX512CDVL-NEXT: retq 202 ; 203 ; AVX512CD-LABEL: testv2i64u: 204 ; AVX512CD: # BB#0: 205 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 206 ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm1 207 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 208 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 209 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] 210 ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 211 ; AVX512CD-NEXT: retq 212 ; 213 ; X32-SSE-LABEL: testv2i64u: 214 ; X32-SSE: # BB#0: 215 ; X32-SSE-NEXT: pextrd $2, %xmm0, %eax 216 ; X32-SSE-NEXT: bsfl %eax, %ecx 217 ; X32-SSE-NEXT: pextrd $3, %xmm0, %edx 218 ; X32-SSE-NEXT: bsfl %edx, %edx 219 ; X32-SSE-NEXT: addl $32, %edx 220 ; X32-SSE-NEXT: testl %eax, %eax 221 ; X32-SSE-NEXT: cmovnel %ecx, %edx 222 ; X32-SSE-NEXT: movd %edx, %xmm1 223 ; X32-SSE-NEXT: movd %xmm0, %eax 224 ; X32-SSE-NEXT: bsfl %eax, %ecx 225 ; X32-SSE-NEXT: pextrd $1, %xmm0, %edx 226 ; X32-SSE-NEXT: bsfl %edx, %edx 227 ; X32-SSE-NEXT: addl $32, %edx 228 ; X32-SSE-NEXT: testl %eax, %eax 229 ; X32-SSE-NEXT: cmovnel %ecx, %edx 230 ; X32-SSE-NEXT: movd %edx, %xmm0 231 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 232 ; X32-SSE-NEXT: retl 233 %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1) 234 ret <2 x i64> %out 235 } 236 237 define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { 238 ; SSE2-LABEL: testv4i32: 239 ; SSE2: # BB#0: 240 ; SSE2-NEXT: pxor %xmm1, %xmm1 241 ; SSE2-NEXT: pxor %xmm2, %xmm2 242 ; SSE2-NEXT: psubd %xmm0, %xmm2 243 ; SSE2-NEXT: pand %xmm0, %xmm2 244 ; SSE2-NEXT: psubd {{.*}}(%rip), %xmm2 245 
; SSE2-NEXT: movdqa %xmm2, %xmm0 246 ; SSE2-NEXT: psrld $1, %xmm0 247 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 248 ; SSE2-NEXT: psubd %xmm0, %xmm2 249 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] 250 ; SSE2-NEXT: movdqa %xmm2, %xmm3 251 ; SSE2-NEXT: pand %xmm0, %xmm3 252 ; SSE2-NEXT: psrld $2, %xmm2 253 ; SSE2-NEXT: pand %xmm0, %xmm2 254 ; SSE2-NEXT: paddd %xmm3, %xmm2 255 ; SSE2-NEXT: movdqa %xmm2, %xmm0 256 ; SSE2-NEXT: psrld $4, %xmm0 257 ; SSE2-NEXT: paddd %xmm2, %xmm0 258 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 259 ; SSE2-NEXT: movdqa %xmm0, %xmm2 260 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 261 ; SSE2-NEXT: psadbw %xmm1, %xmm2 262 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 263 ; SSE2-NEXT: psadbw %xmm1, %xmm0 264 ; SSE2-NEXT: packuswb %xmm2, %xmm0 265 ; SSE2-NEXT: retq 266 ; 267 ; SSE3-LABEL: testv4i32: 268 ; SSE3: # BB#0: 269 ; SSE3-NEXT: pxor %xmm1, %xmm1 270 ; SSE3-NEXT: pxor %xmm2, %xmm2 271 ; SSE3-NEXT: psubd %xmm0, %xmm2 272 ; SSE3-NEXT: pand %xmm0, %xmm2 273 ; SSE3-NEXT: psubd {{.*}}(%rip), %xmm2 274 ; SSE3-NEXT: movdqa %xmm2, %xmm0 275 ; SSE3-NEXT: psrld $1, %xmm0 276 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 277 ; SSE3-NEXT: psubd %xmm0, %xmm2 278 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] 279 ; SSE3-NEXT: movdqa %xmm2, %xmm3 280 ; SSE3-NEXT: pand %xmm0, %xmm3 281 ; SSE3-NEXT: psrld $2, %xmm2 282 ; SSE3-NEXT: pand %xmm0, %xmm2 283 ; SSE3-NEXT: paddd %xmm3, %xmm2 284 ; SSE3-NEXT: movdqa %xmm2, %xmm0 285 ; SSE3-NEXT: psrld $4, %xmm0 286 ; SSE3-NEXT: paddd %xmm2, %xmm0 287 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 288 ; SSE3-NEXT: movdqa %xmm0, %xmm2 289 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 290 ; SSE3-NEXT: psadbw %xmm1, %xmm2 291 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 292 ; SSE3-NEXT: psadbw %xmm1, %xmm0 293 ; SSE3-NEXT: packuswb %xmm2, %xmm0 294 ; SSE3-NEXT: retq 295 ; 296 ; 
SSSE3-LABEL: testv4i32: 297 ; SSSE3: # BB#0: 298 ; SSSE3-NEXT: pxor %xmm1, %xmm1 299 ; SSSE3-NEXT: pxor %xmm2, %xmm2 300 ; SSSE3-NEXT: psubd %xmm0, %xmm2 301 ; SSSE3-NEXT: pand %xmm0, %xmm2 302 ; SSSE3-NEXT: psubd {{.*}}(%rip), %xmm2 303 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 304 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 305 ; SSSE3-NEXT: pand %xmm3, %xmm4 306 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 307 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 308 ; SSSE3-NEXT: pshufb %xmm4, %xmm5 309 ; SSSE3-NEXT: psrlw $4, %xmm2 310 ; SSSE3-NEXT: pand %xmm3, %xmm2 311 ; SSSE3-NEXT: pshufb %xmm2, %xmm0 312 ; SSSE3-NEXT: paddb %xmm5, %xmm0 313 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 314 ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 315 ; SSSE3-NEXT: psadbw %xmm1, %xmm2 316 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 317 ; SSSE3-NEXT: psadbw %xmm1, %xmm0 318 ; SSSE3-NEXT: packuswb %xmm2, %xmm0 319 ; SSSE3-NEXT: retq 320 ; 321 ; SSE41-LABEL: testv4i32: 322 ; SSE41: # BB#0: 323 ; SSE41-NEXT: pxor %xmm1, %xmm1 324 ; SSE41-NEXT: pxor %xmm2, %xmm2 325 ; SSE41-NEXT: psubd %xmm0, %xmm2 326 ; SSE41-NEXT: pand %xmm0, %xmm2 327 ; SSE41-NEXT: psubd {{.*}}(%rip), %xmm2 328 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 329 ; SSE41-NEXT: movdqa %xmm2, %xmm4 330 ; SSE41-NEXT: pand %xmm3, %xmm4 331 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 332 ; SSE41-NEXT: movdqa %xmm0, %xmm5 333 ; SSE41-NEXT: pshufb %xmm4, %xmm5 334 ; SSE41-NEXT: psrlw $4, %xmm2 335 ; SSE41-NEXT: pand %xmm3, %xmm2 336 ; SSE41-NEXT: pshufb %xmm2, %xmm0 337 ; SSE41-NEXT: paddb %xmm5, %xmm0 338 ; SSE41-NEXT: movdqa %xmm0, %xmm2 339 ; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 340 ; SSE41-NEXT: psadbw %xmm1, %xmm2 341 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 342 ; SSE41-NEXT: psadbw %xmm1, %xmm0 
343 ; SSE41-NEXT: packuswb %xmm2, %xmm0 344 ; SSE41-NEXT: retq 345 ; 346 ; AVX1-LABEL: testv4i32: 347 ; AVX1: # BB#0: 348 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 349 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm2 350 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 351 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 352 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 353 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 354 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 355 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 356 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 357 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 358 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 359 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 360 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 361 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 362 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 363 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 364 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 365 ; AVX1-NEXT: retq 366 ; 367 ; AVX2-LABEL: testv4i32: 368 ; AVX2: # BB#0: 369 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 370 ; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm2 371 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 372 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 373 ; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 374 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 375 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 376 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 377 ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 378 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 379 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 380 ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 381 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 382 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 383 ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 384 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 385 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 386 ; 
AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 387 ; AVX2-NEXT: retq 388 ; 389 ; AVX512CDVL-LABEL: testv4i32: 390 ; AVX512CDVL: # BB#0: 391 ; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1 392 ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm2 393 ; AVX512CDVL-NEXT: vpandd %xmm2, %xmm0, %xmm0 394 ; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to4}, %xmm0, %xmm0 395 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 396 ; AVX512CDVL-NEXT: vpandq %xmm2, %xmm0, %xmm3 397 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 398 ; AVX512CDVL-NEXT: vpshufb %xmm3, %xmm4, %xmm3 399 ; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 400 ; AVX512CDVL-NEXT: vpandq %xmm2, %xmm0, %xmm0 401 ; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm4, %xmm0 402 ; AVX512CDVL-NEXT: vpaddb %xmm3, %xmm0, %xmm0 403 ; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 404 ; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 405 ; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 406 ; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 407 ; AVX512CDVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 408 ; AVX512CDVL-NEXT: retq 409 ; 410 ; AVX512CD-LABEL: testv4i32: 411 ; AVX512CD: # BB#0: 412 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 413 ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm2 414 ; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 415 ; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 416 ; AVX512CD-NEXT: vpsubd %xmm2, %xmm0, %xmm0 417 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 418 ; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm3 419 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 420 ; AVX512CD-NEXT: vpshufb %xmm3, %xmm4, %xmm3 421 ; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 422 ; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 423 ; AVX512CD-NEXT: vpshufb %xmm0, %xmm4, %xmm0 424 ; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0 425 ; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = 
xmm0[2],xmm1[2],xmm0[3],xmm1[3] 426 ; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 427 ; AVX512CD-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 428 ; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 429 ; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 430 ; AVX512CD-NEXT: retq 431 ; 432 ; X32-SSE-LABEL: testv4i32: 433 ; X32-SSE: # BB#0: 434 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 435 ; X32-SSE-NEXT: pxor %xmm2, %xmm2 436 ; X32-SSE-NEXT: psubd %xmm0, %xmm2 437 ; X32-SSE-NEXT: pand %xmm0, %xmm2 438 ; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2 439 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 440 ; X32-SSE-NEXT: movdqa %xmm2, %xmm4 441 ; X32-SSE-NEXT: pand %xmm3, %xmm4 442 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 443 ; X32-SSE-NEXT: movdqa %xmm0, %xmm5 444 ; X32-SSE-NEXT: pshufb %xmm4, %xmm5 445 ; X32-SSE-NEXT: psrlw $4, %xmm2 446 ; X32-SSE-NEXT: pand %xmm3, %xmm2 447 ; X32-SSE-NEXT: pshufb %xmm2, %xmm0 448 ; X32-SSE-NEXT: paddb %xmm5, %xmm0 449 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 450 ; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 451 ; X32-SSE-NEXT: psadbw %xmm1, %xmm2 452 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 453 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0 454 ; X32-SSE-NEXT: packuswb %xmm2, %xmm0 455 ; X32-SSE-NEXT: retl 456 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0) 457 ret <4 x i32> %out 458 } 459 460 define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { 461 ; SSE2-LABEL: testv4i32u: 462 ; SSE2: # BB#0: 463 ; SSE2-NEXT: pxor %xmm1, %xmm1 464 ; SSE2-NEXT: pxor %xmm2, %xmm2 465 ; SSE2-NEXT: psubd %xmm0, %xmm2 466 ; SSE2-NEXT: pand %xmm0, %xmm2 467 ; SSE2-NEXT: psubd {{.*}}(%rip), %xmm2 468 ; SSE2-NEXT: movdqa %xmm2, %xmm0 469 ; SSE2-NEXT: psrld $1, %xmm0 470 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 471 ; SSE2-NEXT: psubd %xmm0, %xmm2 472 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] 473 ; 
SSE2-NEXT: movdqa %xmm2, %xmm3 474 ; SSE2-NEXT: pand %xmm0, %xmm3 475 ; SSE2-NEXT: psrld $2, %xmm2 476 ; SSE2-NEXT: pand %xmm0, %xmm2 477 ; SSE2-NEXT: paddd %xmm3, %xmm2 478 ; SSE2-NEXT: movdqa %xmm2, %xmm0 479 ; SSE2-NEXT: psrld $4, %xmm0 480 ; SSE2-NEXT: paddd %xmm2, %xmm0 481 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 482 ; SSE2-NEXT: movdqa %xmm0, %xmm2 483 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 484 ; SSE2-NEXT: psadbw %xmm1, %xmm2 485 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 486 ; SSE2-NEXT: psadbw %xmm1, %xmm0 487 ; SSE2-NEXT: packuswb %xmm2, %xmm0 488 ; SSE2-NEXT: retq 489 ; 490 ; SSE3-LABEL: testv4i32u: 491 ; SSE3: # BB#0: 492 ; SSE3-NEXT: pxor %xmm1, %xmm1 493 ; SSE3-NEXT: pxor %xmm2, %xmm2 494 ; SSE3-NEXT: psubd %xmm0, %xmm2 495 ; SSE3-NEXT: pand %xmm0, %xmm2 496 ; SSE3-NEXT: psubd {{.*}}(%rip), %xmm2 497 ; SSE3-NEXT: movdqa %xmm2, %xmm0 498 ; SSE3-NEXT: psrld $1, %xmm0 499 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 500 ; SSE3-NEXT: psubd %xmm0, %xmm2 501 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] 502 ; SSE3-NEXT: movdqa %xmm2, %xmm3 503 ; SSE3-NEXT: pand %xmm0, %xmm3 504 ; SSE3-NEXT: psrld $2, %xmm2 505 ; SSE3-NEXT: pand %xmm0, %xmm2 506 ; SSE3-NEXT: paddd %xmm3, %xmm2 507 ; SSE3-NEXT: movdqa %xmm2, %xmm0 508 ; SSE3-NEXT: psrld $4, %xmm0 509 ; SSE3-NEXT: paddd %xmm2, %xmm0 510 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 511 ; SSE3-NEXT: movdqa %xmm0, %xmm2 512 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 513 ; SSE3-NEXT: psadbw %xmm1, %xmm2 514 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 515 ; SSE3-NEXT: psadbw %xmm1, %xmm0 516 ; SSE3-NEXT: packuswb %xmm2, %xmm0 517 ; SSE3-NEXT: retq 518 ; 519 ; SSSE3-LABEL: testv4i32u: 520 ; SSSE3: # BB#0: 521 ; SSSE3-NEXT: pxor %xmm1, %xmm1 522 ; SSSE3-NEXT: pxor %xmm2, %xmm2 523 ; SSSE3-NEXT: psubd %xmm0, %xmm2 524 ; SSSE3-NEXT: pand %xmm0, %xmm2 525 ; SSSE3-NEXT: psubd {{.*}}(%rip), 
%xmm2 526 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 527 ; SSSE3-NEXT: movdqa %xmm2, %xmm4 528 ; SSSE3-NEXT: pand %xmm3, %xmm4 529 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 530 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 531 ; SSSE3-NEXT: pshufb %xmm4, %xmm5 532 ; SSSE3-NEXT: psrlw $4, %xmm2 533 ; SSSE3-NEXT: pand %xmm3, %xmm2 534 ; SSSE3-NEXT: pshufb %xmm2, %xmm0 535 ; SSSE3-NEXT: paddb %xmm5, %xmm0 536 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 537 ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 538 ; SSSE3-NEXT: psadbw %xmm1, %xmm2 539 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 540 ; SSSE3-NEXT: psadbw %xmm1, %xmm0 541 ; SSSE3-NEXT: packuswb %xmm2, %xmm0 542 ; SSSE3-NEXT: retq 543 ; 544 ; SSE41-LABEL: testv4i32u: 545 ; SSE41: # BB#0: 546 ; SSE41-NEXT: pxor %xmm1, %xmm1 547 ; SSE41-NEXT: pxor %xmm2, %xmm2 548 ; SSE41-NEXT: psubd %xmm0, %xmm2 549 ; SSE41-NEXT: pand %xmm0, %xmm2 550 ; SSE41-NEXT: psubd {{.*}}(%rip), %xmm2 551 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 552 ; SSE41-NEXT: movdqa %xmm2, %xmm4 553 ; SSE41-NEXT: pand %xmm3, %xmm4 554 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 555 ; SSE41-NEXT: movdqa %xmm0, %xmm5 556 ; SSE41-NEXT: pshufb %xmm4, %xmm5 557 ; SSE41-NEXT: psrlw $4, %xmm2 558 ; SSE41-NEXT: pand %xmm3, %xmm2 559 ; SSE41-NEXT: pshufb %xmm2, %xmm0 560 ; SSE41-NEXT: paddb %xmm5, %xmm0 561 ; SSE41-NEXT: movdqa %xmm0, %xmm2 562 ; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 563 ; SSE41-NEXT: psadbw %xmm1, %xmm2 564 ; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 565 ; SSE41-NEXT: psadbw %xmm1, %xmm0 566 ; SSE41-NEXT: packuswb %xmm2, %xmm0 567 ; SSE41-NEXT: retq 568 ; 569 ; AVX1-LABEL: testv4i32u: 570 ; AVX1: # BB#0: 571 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 572 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm2 573 ; AVX1-NEXT: vpand 
%xmm2, %xmm0, %xmm0 574 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 575 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 576 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 577 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 578 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 579 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 580 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 581 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 582 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 583 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 584 ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 585 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 586 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 587 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 588 ; AVX1-NEXT: retq 589 ; 590 ; AVX2-LABEL: testv4i32u: 591 ; AVX2: # BB#0: 592 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 593 ; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm2 594 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 595 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 596 ; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 597 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 598 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 599 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 600 ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 601 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 602 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 603 ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 604 ; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 605 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] 606 ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 607 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 608 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 609 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 610 ; AVX2-NEXT: retq 611 ; 612 ; AVX512CDVL-LABEL: testv4i32u: 613 ; AVX512CDVL: # BB#0: 614 ; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1 615 ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 616 
; AVX512CDVL-NEXT: vpandd %xmm1, %xmm0, %xmm0 617 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 618 ; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 619 ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 620 ; AVX512CDVL-NEXT: retq 621 ; 622 ; AVX512CD-LABEL: testv4i32u: 623 ; AVX512CD: # BB#0: 624 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 625 ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1 626 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 627 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 628 ; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 629 ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0 630 ; AVX512CD-NEXT: retq 631 ; 632 ; X32-SSE-LABEL: testv4i32u: 633 ; X32-SSE: # BB#0: 634 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 635 ; X32-SSE-NEXT: pxor %xmm2, %xmm2 636 ; X32-SSE-NEXT: psubd %xmm0, %xmm2 637 ; X32-SSE-NEXT: pand %xmm0, %xmm2 638 ; X32-SSE-NEXT: psubd {{\.LCPI.*}}, %xmm2 639 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 640 ; X32-SSE-NEXT: movdqa %xmm2, %xmm4 641 ; X32-SSE-NEXT: pand %xmm3, %xmm4 642 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 643 ; X32-SSE-NEXT: movdqa %xmm0, %xmm5 644 ; X32-SSE-NEXT: pshufb %xmm4, %xmm5 645 ; X32-SSE-NEXT: psrlw $4, %xmm2 646 ; X32-SSE-NEXT: pand %xmm3, %xmm2 647 ; X32-SSE-NEXT: pshufb %xmm2, %xmm0 648 ; X32-SSE-NEXT: paddb %xmm5, %xmm0 649 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 650 ; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] 651 ; X32-SSE-NEXT: psadbw %xmm1, %xmm2 652 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 653 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0 654 ; X32-SSE-NEXT: packuswb %xmm2, %xmm0 655 ; X32-SSE-NEXT: retl 656 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1) 657 ret <4 x i32> %out 658 } 659 660 define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { 661 ; SSE2-LABEL: testv8i16: 662 ; SSE2: # BB#0: 663 ; SSE2-NEXT: pxor %xmm1, %xmm1 664 ; SSE2-NEXT: psubw %xmm0, %xmm1 665 ; SSE2-NEXT: pand %xmm0, %xmm1 
666 ; SSE2-NEXT: psubw {{.*}}(%rip), %xmm1 667 ; SSE2-NEXT: movdqa %xmm1, %xmm0 668 ; SSE2-NEXT: psrlw $1, %xmm0 669 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 670 ; SSE2-NEXT: psubw %xmm0, %xmm1 671 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] 672 ; SSE2-NEXT: movdqa %xmm1, %xmm2 673 ; SSE2-NEXT: pand %xmm0, %xmm2 674 ; SSE2-NEXT: psrlw $2, %xmm1 675 ; SSE2-NEXT: pand %xmm0, %xmm1 676 ; SSE2-NEXT: paddw %xmm2, %xmm1 677 ; SSE2-NEXT: movdqa %xmm1, %xmm2 678 ; SSE2-NEXT: psrlw $4, %xmm2 679 ; SSE2-NEXT: paddw %xmm1, %xmm2 680 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 681 ; SSE2-NEXT: movdqa %xmm2, %xmm0 682 ; SSE2-NEXT: psllw $8, %xmm0 683 ; SSE2-NEXT: paddb %xmm2, %xmm0 684 ; SSE2-NEXT: psrlw $8, %xmm0 685 ; SSE2-NEXT: retq 686 ; 687 ; SSE3-LABEL: testv8i16: 688 ; SSE3: # BB#0: 689 ; SSE3-NEXT: pxor %xmm1, %xmm1 690 ; SSE3-NEXT: psubw %xmm0, %xmm1 691 ; SSE3-NEXT: pand %xmm0, %xmm1 692 ; SSE3-NEXT: psubw {{.*}}(%rip), %xmm1 693 ; SSE3-NEXT: movdqa %xmm1, %xmm0 694 ; SSE3-NEXT: psrlw $1, %xmm0 695 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 696 ; SSE3-NEXT: psubw %xmm0, %xmm1 697 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] 698 ; SSE3-NEXT: movdqa %xmm1, %xmm2 699 ; SSE3-NEXT: pand %xmm0, %xmm2 700 ; SSE3-NEXT: psrlw $2, %xmm1 701 ; SSE3-NEXT: pand %xmm0, %xmm1 702 ; SSE3-NEXT: paddw %xmm2, %xmm1 703 ; SSE3-NEXT: movdqa %xmm1, %xmm2 704 ; SSE3-NEXT: psrlw $4, %xmm2 705 ; SSE3-NEXT: paddw %xmm1, %xmm2 706 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 707 ; SSE3-NEXT: movdqa %xmm2, %xmm0 708 ; SSE3-NEXT: psllw $8, %xmm0 709 ; SSE3-NEXT: paddb %xmm2, %xmm0 710 ; SSE3-NEXT: psrlw $8, %xmm0 711 ; SSE3-NEXT: retq 712 ; 713 ; SSSE3-LABEL: testv8i16: 714 ; SSSE3: # BB#0: 715 ; SSSE3-NEXT: pxor %xmm1, %xmm1 716 ; SSSE3-NEXT: psubw %xmm0, %xmm1 717 ; SSSE3-NEXT: pand %xmm0, %xmm1 718 ; SSSE3-NEXT: psubw {{.*}}(%rip), %xmm1 719 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 720 ; 
SSSE3-NEXT: movdqa %xmm1, %xmm2 721 ; SSSE3-NEXT: pand %xmm0, %xmm2 722 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 723 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 724 ; SSSE3-NEXT: pshufb %xmm2, %xmm4 725 ; SSSE3-NEXT: psrlw $4, %xmm1 726 ; SSSE3-NEXT: pand %xmm0, %xmm1 727 ; SSSE3-NEXT: pshufb %xmm1, %xmm3 728 ; SSSE3-NEXT: paddb %xmm4, %xmm3 729 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 730 ; SSSE3-NEXT: psllw $8, %xmm0 731 ; SSSE3-NEXT: paddb %xmm3, %xmm0 732 ; SSSE3-NEXT: psrlw $8, %xmm0 733 ; SSSE3-NEXT: retq 734 ; 735 ; SSE41-LABEL: testv8i16: 736 ; SSE41: # BB#0: 737 ; SSE41-NEXT: pxor %xmm1, %xmm1 738 ; SSE41-NEXT: psubw %xmm0, %xmm1 739 ; SSE41-NEXT: pand %xmm0, %xmm1 740 ; SSE41-NEXT: psubw {{.*}}(%rip), %xmm1 741 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 742 ; SSE41-NEXT: movdqa %xmm1, %xmm2 743 ; SSE41-NEXT: pand %xmm0, %xmm2 744 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 745 ; SSE41-NEXT: movdqa %xmm3, %xmm4 746 ; SSE41-NEXT: pshufb %xmm2, %xmm4 747 ; SSE41-NEXT: psrlw $4, %xmm1 748 ; SSE41-NEXT: pand %xmm0, %xmm1 749 ; SSE41-NEXT: pshufb %xmm1, %xmm3 750 ; SSE41-NEXT: paddb %xmm4, %xmm3 751 ; SSE41-NEXT: movdqa %xmm3, %xmm0 752 ; SSE41-NEXT: psllw $8, %xmm0 753 ; SSE41-NEXT: paddb %xmm3, %xmm0 754 ; SSE41-NEXT: psrlw $8, %xmm0 755 ; SSE41-NEXT: retq 756 ; 757 ; AVX1-LABEL: testv8i16: 758 ; AVX1: # BB#0: 759 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 760 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 761 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 762 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 763 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 764 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 765 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 766 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 767 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 768 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 769 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 770 ; 
AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 771 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 772 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 773 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 774 ; AVX1-NEXT: retq 775 ; 776 ; AVX2-LABEL: testv8i16: 777 ; AVX2: # BB#0: 778 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 779 ; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 780 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 781 ; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 782 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 783 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 784 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 785 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 786 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 787 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 788 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 789 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 790 ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 791 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 792 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 793 ; AVX2-NEXT: retq 794 ; 795 ; AVX512CDVL-LABEL: testv8i16: 796 ; AVX512CDVL: # BB#0: 797 ; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1 798 ; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 799 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0 800 ; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 801 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 802 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2 803 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 804 ; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 805 ; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 806 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0 807 ; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 808 ; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 809 ; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 810 ; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 811 ; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 812 ; AVX512CDVL-NEXT: retq 813 ; 814 ; AVX512CD-LABEL: testv8i16: 815 ; AVX512CD: # BB#0: 816 ; 
AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 817 ; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1 818 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 819 ; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 820 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 821 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 822 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 823 ; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 824 ; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 825 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 826 ; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 827 ; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 828 ; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 829 ; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 830 ; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 831 ; AVX512CD-NEXT: retq 832 ; 833 ; X32-SSE-LABEL: testv8i16: 834 ; X32-SSE: # BB#0: 835 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 836 ; X32-SSE-NEXT: psubw %xmm0, %xmm1 837 ; X32-SSE-NEXT: pand %xmm0, %xmm1 838 ; X32-SSE-NEXT: psubw {{\.LCPI.*}}, %xmm1 839 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 840 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2 841 ; X32-SSE-NEXT: pand %xmm0, %xmm2 842 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 843 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 844 ; X32-SSE-NEXT: pshufb %xmm2, %xmm4 845 ; X32-SSE-NEXT: psrlw $4, %xmm1 846 ; X32-SSE-NEXT: pand %xmm0, %xmm1 847 ; X32-SSE-NEXT: pshufb %xmm1, %xmm3 848 ; X32-SSE-NEXT: paddb %xmm4, %xmm3 849 ; X32-SSE-NEXT: movdqa %xmm3, %xmm0 850 ; X32-SSE-NEXT: psllw $8, %xmm0 851 ; X32-SSE-NEXT: paddb %xmm3, %xmm0 852 ; X32-SSE-NEXT: psrlw $8, %xmm0 853 ; X32-SSE-NEXT: retl 854 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0) 855 ret <8 x i16> %out 856 } 857 858 define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { 859 ; SSE2-LABEL: testv8i16u: 860 ; SSE2: # BB#0: 861 ; SSE2-NEXT: pxor %xmm1, %xmm1 862 ; SSE2-NEXT: psubw %xmm0, %xmm1 863 ; SSE2-NEXT: pand %xmm0, 
%xmm1 864 ; SSE2-NEXT: psubw {{.*}}(%rip), %xmm1 865 ; SSE2-NEXT: movdqa %xmm1, %xmm0 866 ; SSE2-NEXT: psrlw $1, %xmm0 867 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 868 ; SSE2-NEXT: psubw %xmm0, %xmm1 869 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] 870 ; SSE2-NEXT: movdqa %xmm1, %xmm2 871 ; SSE2-NEXT: pand %xmm0, %xmm2 872 ; SSE2-NEXT: psrlw $2, %xmm1 873 ; SSE2-NEXT: pand %xmm0, %xmm1 874 ; SSE2-NEXT: paddw %xmm2, %xmm1 875 ; SSE2-NEXT: movdqa %xmm1, %xmm2 876 ; SSE2-NEXT: psrlw $4, %xmm2 877 ; SSE2-NEXT: paddw %xmm1, %xmm2 878 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 879 ; SSE2-NEXT: movdqa %xmm2, %xmm0 880 ; SSE2-NEXT: psllw $8, %xmm0 881 ; SSE2-NEXT: paddb %xmm2, %xmm0 882 ; SSE2-NEXT: psrlw $8, %xmm0 883 ; SSE2-NEXT: retq 884 ; 885 ; SSE3-LABEL: testv8i16u: 886 ; SSE3: # BB#0: 887 ; SSE3-NEXT: pxor %xmm1, %xmm1 888 ; SSE3-NEXT: psubw %xmm0, %xmm1 889 ; SSE3-NEXT: pand %xmm0, %xmm1 890 ; SSE3-NEXT: psubw {{.*}}(%rip), %xmm1 891 ; SSE3-NEXT: movdqa %xmm1, %xmm0 892 ; SSE3-NEXT: psrlw $1, %xmm0 893 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 894 ; SSE3-NEXT: psubw %xmm0, %xmm1 895 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] 896 ; SSE3-NEXT: movdqa %xmm1, %xmm2 897 ; SSE3-NEXT: pand %xmm0, %xmm2 898 ; SSE3-NEXT: psrlw $2, %xmm1 899 ; SSE3-NEXT: pand %xmm0, %xmm1 900 ; SSE3-NEXT: paddw %xmm2, %xmm1 901 ; SSE3-NEXT: movdqa %xmm1, %xmm2 902 ; SSE3-NEXT: psrlw $4, %xmm2 903 ; SSE3-NEXT: paddw %xmm1, %xmm2 904 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 905 ; SSE3-NEXT: movdqa %xmm2, %xmm0 906 ; SSE3-NEXT: psllw $8, %xmm0 907 ; SSE3-NEXT: paddb %xmm2, %xmm0 908 ; SSE3-NEXT: psrlw $8, %xmm0 909 ; SSE3-NEXT: retq 910 ; 911 ; SSSE3-LABEL: testv8i16u: 912 ; SSSE3: # BB#0: 913 ; SSSE3-NEXT: pxor %xmm1, %xmm1 914 ; SSSE3-NEXT: psubw %xmm0, %xmm1 915 ; SSSE3-NEXT: pand %xmm0, %xmm1 916 ; SSSE3-NEXT: psubw {{.*}}(%rip), %xmm1 917 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 918 ; SSSE3-NEXT: movdqa %xmm1, %xmm2 919 ; SSSE3-NEXT: pand %xmm0, %xmm2 920 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 921 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 922 ; SSSE3-NEXT: pshufb %xmm2, %xmm4 923 ; SSSE3-NEXT: psrlw $4, %xmm1 924 ; SSSE3-NEXT: pand %xmm0, %xmm1 925 ; SSSE3-NEXT: pshufb %xmm1, %xmm3 926 ; SSSE3-NEXT: paddb %xmm4, %xmm3 927 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 928 ; SSSE3-NEXT: psllw $8, %xmm0 929 ; SSSE3-NEXT: paddb %xmm3, %xmm0 930 ; SSSE3-NEXT: psrlw $8, %xmm0 931 ; SSSE3-NEXT: retq 932 ; 933 ; SSE41-LABEL: testv8i16u: 934 ; SSE41: # BB#0: 935 ; SSE41-NEXT: pxor %xmm1, %xmm1 936 ; SSE41-NEXT: psubw %xmm0, %xmm1 937 ; SSE41-NEXT: pand %xmm0, %xmm1 938 ; SSE41-NEXT: psubw {{.*}}(%rip), %xmm1 939 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 940 ; SSE41-NEXT: movdqa %xmm1, %xmm2 941 ; SSE41-NEXT: pand %xmm0, %xmm2 942 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 943 ; SSE41-NEXT: movdqa %xmm3, %xmm4 944 ; SSE41-NEXT: pshufb %xmm2, %xmm4 945 ; SSE41-NEXT: psrlw $4, %xmm1 946 ; SSE41-NEXT: pand %xmm0, %xmm1 947 ; SSE41-NEXT: pshufb %xmm1, %xmm3 948 ; SSE41-NEXT: paddb %xmm4, %xmm3 949 ; SSE41-NEXT: movdqa %xmm3, %xmm0 950 ; SSE41-NEXT: psllw $8, %xmm0 951 ; SSE41-NEXT: paddb %xmm3, %xmm0 952 ; SSE41-NEXT: psrlw $8, %xmm0 953 ; SSE41-NEXT: retq 954 ; 955 ; AVX1-LABEL: testv8i16u: 956 ; AVX1: # BB#0: 957 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 958 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 959 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 960 ; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 961 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 962 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 963 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 964 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 965 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 966 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 
967 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 968 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 969 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 970 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 971 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 972 ; AVX1-NEXT: retq 973 ; 974 ; AVX2-LABEL: testv8i16u: 975 ; AVX2: # BB#0: 976 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 977 ; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 978 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 979 ; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 980 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 981 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 982 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 983 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 984 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 985 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 986 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 987 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 988 ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 989 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 990 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 991 ; AVX2-NEXT: retq 992 ; 993 ; AVX512CDVL-LABEL: testv8i16u: 994 ; AVX512CDVL: # BB#0: 995 ; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1 996 ; AVX512CDVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 997 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0 998 ; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 999 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1000 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2 1001 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1002 ; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1003 ; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 1004 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0 1005 ; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1006 ; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1007 ; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 1008 ; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 1009 ; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 1010 ; AVX512CDVL-NEXT: retq 1011 ; 1012 
; AVX512CD-LABEL: testv8i16u: 1013 ; AVX512CD: # BB#0: 1014 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 1015 ; AVX512CD-NEXT: vpsubw %xmm0, %xmm1, %xmm1 1016 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 1017 ; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 1018 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1019 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 1020 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1021 ; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1022 ; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 1023 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 1024 ; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1025 ; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1026 ; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 1027 ; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 1028 ; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 1029 ; AVX512CD-NEXT: retq 1030 ; 1031 ; X32-SSE-LABEL: testv8i16u: 1032 ; X32-SSE: # BB#0: 1033 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 1034 ; X32-SSE-NEXT: psubw %xmm0, %xmm1 1035 ; X32-SSE-NEXT: pand %xmm0, %xmm1 1036 ; X32-SSE-NEXT: psubw {{\.LCPI.*}}, %xmm1 1037 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1038 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2 1039 ; X32-SSE-NEXT: pand %xmm0, %xmm2 1040 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1041 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 1042 ; X32-SSE-NEXT: pshufb %xmm2, %xmm4 1043 ; X32-SSE-NEXT: psrlw $4, %xmm1 1044 ; X32-SSE-NEXT: pand %xmm0, %xmm1 1045 ; X32-SSE-NEXT: pshufb %xmm1, %xmm3 1046 ; X32-SSE-NEXT: paddb %xmm4, %xmm3 1047 ; X32-SSE-NEXT: movdqa %xmm3, %xmm0 1048 ; X32-SSE-NEXT: psllw $8, %xmm0 1049 ; X32-SSE-NEXT: paddb %xmm3, %xmm0 1050 ; X32-SSE-NEXT: psrlw $8, %xmm0 1051 ; X32-SSE-NEXT: retl 1052 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1) 1053 ret <8 x i16> %out 1054 } 1055 1056 define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { 1057 ; SSE2-LABEL: testv16i8: 1058 ; SSE2: # 
BB#0: 1059 ; SSE2-NEXT: pxor %xmm1, %xmm1 1060 ; SSE2-NEXT: psubb %xmm0, %xmm1 1061 ; SSE2-NEXT: pand %xmm0, %xmm1 1062 ; SSE2-NEXT: psubb {{.*}}(%rip), %xmm1 1063 ; SSE2-NEXT: movdqa %xmm1, %xmm0 1064 ; SSE2-NEXT: psrlw $1, %xmm0 1065 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1066 ; SSE2-NEXT: psubb %xmm0, %xmm1 1067 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1068 ; SSE2-NEXT: movdqa %xmm1, %xmm2 1069 ; SSE2-NEXT: pand %xmm0, %xmm2 1070 ; SSE2-NEXT: psrlw $2, %xmm1 1071 ; SSE2-NEXT: pand %xmm0, %xmm1 1072 ; SSE2-NEXT: paddb %xmm2, %xmm1 1073 ; SSE2-NEXT: movdqa %xmm1, %xmm0 1074 ; SSE2-NEXT: psrlw $4, %xmm0 1075 ; SSE2-NEXT: paddb %xmm1, %xmm0 1076 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1077 ; SSE2-NEXT: retq 1078 ; 1079 ; SSE3-LABEL: testv16i8: 1080 ; SSE3: # BB#0: 1081 ; SSE3-NEXT: pxor %xmm1, %xmm1 1082 ; SSE3-NEXT: psubb %xmm0, %xmm1 1083 ; SSE3-NEXT: pand %xmm0, %xmm1 1084 ; SSE3-NEXT: psubb {{.*}}(%rip), %xmm1 1085 ; SSE3-NEXT: movdqa %xmm1, %xmm0 1086 ; SSE3-NEXT: psrlw $1, %xmm0 1087 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1088 ; SSE3-NEXT: psubb %xmm0, %xmm1 1089 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1090 ; SSE3-NEXT: movdqa %xmm1, %xmm2 1091 ; SSE3-NEXT: pand %xmm0, %xmm2 1092 ; SSE3-NEXT: psrlw $2, %xmm1 1093 ; SSE3-NEXT: pand %xmm0, %xmm1 1094 ; SSE3-NEXT: paddb %xmm2, %xmm1 1095 ; SSE3-NEXT: movdqa %xmm1, %xmm0 1096 ; SSE3-NEXT: psrlw $4, %xmm0 1097 ; SSE3-NEXT: paddb %xmm1, %xmm0 1098 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1099 ; SSE3-NEXT: retq 1100 ; 1101 ; SSSE3-LABEL: testv16i8: 1102 ; SSSE3: # BB#0: 1103 ; SSSE3-NEXT: pxor %xmm1, %xmm1 1104 ; SSSE3-NEXT: psubb %xmm0, %xmm1 1105 ; SSSE3-NEXT: pand %xmm0, %xmm1 1106 ; SSSE3-NEXT: psubb {{.*}}(%rip), %xmm1 1107 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1108 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 1109 ; SSSE3-NEXT: pand %xmm2, %xmm3 1110 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1111 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 1112 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 1113 ; SSSE3-NEXT: psrlw $4, %xmm1 1114 ; SSSE3-NEXT: pand %xmm2, %xmm1 1115 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 1116 ; SSSE3-NEXT: paddb %xmm4, %xmm0 1117 ; SSSE3-NEXT: retq 1118 ; 1119 ; SSE41-LABEL: testv16i8: 1120 ; SSE41: # BB#0: 1121 ; SSE41-NEXT: pxor %xmm1, %xmm1 1122 ; SSE41-NEXT: psubb %xmm0, %xmm1 1123 ; SSE41-NEXT: pand %xmm0, %xmm1 1124 ; SSE41-NEXT: psubb {{.*}}(%rip), %xmm1 1125 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1126 ; SSE41-NEXT: movdqa %xmm1, %xmm3 1127 ; SSE41-NEXT: pand %xmm2, %xmm3 1128 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1129 ; SSE41-NEXT: movdqa %xmm0, %xmm4 1130 ; SSE41-NEXT: pshufb %xmm3, %xmm4 1131 ; SSE41-NEXT: psrlw $4, %xmm1 1132 ; SSE41-NEXT: pand %xmm2, %xmm1 1133 ; SSE41-NEXT: pshufb %xmm1, %xmm0 1134 ; SSE41-NEXT: paddb %xmm4, %xmm0 1135 ; SSE41-NEXT: retq 1136 ; 1137 ; AVX1-LABEL: testv16i8: 1138 ; AVX1: # BB#0: 1139 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1140 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1 1141 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1142 ; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1143 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1144 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 1145 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1146 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1147 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1148 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1149 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1150 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1151 ; AVX1-NEXT: retq 1152 ; 1153 ; AVX2-LABEL: testv16i8: 1154 ; AVX2: # BB#0: 1155 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1156 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 1157 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 1158 ; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1159 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1160 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 1161 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1162 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1163 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 1164 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 1165 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1166 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1167 ; AVX2-NEXT: retq 1168 ; 1169 ; AVX512CDVL-LABEL: testv16i8: 1170 ; AVX512CDVL: # BB#0: 1171 ; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1 1172 ; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 1173 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0 1174 ; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1175 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1176 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2 1177 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1178 ; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1179 ; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 1180 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0 1181 ; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1182 ; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1183 ; AVX512CDVL-NEXT: retq 1184 ; 1185 ; AVX512CD-LABEL: testv16i8: 1186 ; AVX512CD: # BB#0: 1187 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 1188 ; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1 1189 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 1190 ; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1191 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1192 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 1193 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1194 ; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1195 ; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 1196 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 1197 ; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1198 ; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1199 ; AVX512CD-NEXT: retq 1200 ; 1201 ; 
X32-SSE-LABEL: testv16i8: 1202 ; X32-SSE: # BB#0: 1203 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 1204 ; X32-SSE-NEXT: psubb %xmm0, %xmm1 1205 ; X32-SSE-NEXT: pand %xmm0, %xmm1 1206 ; X32-SSE-NEXT: psubb {{\.LCPI.*}}, %xmm1 1207 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1208 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 1209 ; X32-SSE-NEXT: pand %xmm2, %xmm3 1210 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1211 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 1212 ; X32-SSE-NEXT: pshufb %xmm3, %xmm4 1213 ; X32-SSE-NEXT: psrlw $4, %xmm1 1214 ; X32-SSE-NEXT: pand %xmm2, %xmm1 1215 ; X32-SSE-NEXT: pshufb %xmm1, %xmm0 1216 ; X32-SSE-NEXT: paddb %xmm4, %xmm0 1217 ; X32-SSE-NEXT: retl 1218 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0) 1219 ret <16 x i8> %out 1220 } 1221 1222 define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { 1223 ; SSE2-LABEL: testv16i8u: 1224 ; SSE2: # BB#0: 1225 ; SSE2-NEXT: pxor %xmm1, %xmm1 1226 ; SSE2-NEXT: psubb %xmm0, %xmm1 1227 ; SSE2-NEXT: pand %xmm0, %xmm1 1228 ; SSE2-NEXT: psubb {{.*}}(%rip), %xmm1 1229 ; SSE2-NEXT: movdqa %xmm1, %xmm0 1230 ; SSE2-NEXT: psrlw $1, %xmm0 1231 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1232 ; SSE2-NEXT: psubb %xmm0, %xmm1 1233 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1234 ; SSE2-NEXT: movdqa %xmm1, %xmm2 1235 ; SSE2-NEXT: pand %xmm0, %xmm2 1236 ; SSE2-NEXT: psrlw $2, %xmm1 1237 ; SSE2-NEXT: pand %xmm0, %xmm1 1238 ; SSE2-NEXT: paddb %xmm2, %xmm1 1239 ; SSE2-NEXT: movdqa %xmm1, %xmm0 1240 ; SSE2-NEXT: psrlw $4, %xmm0 1241 ; SSE2-NEXT: paddb %xmm1, %xmm0 1242 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 1243 ; SSE2-NEXT: retq 1244 ; 1245 ; SSE3-LABEL: testv16i8u: 1246 ; SSE3: # BB#0: 1247 ; SSE3-NEXT: pxor %xmm1, %xmm1 1248 ; SSE3-NEXT: psubb %xmm0, %xmm1 1249 ; SSE3-NEXT: pand %xmm0, %xmm1 1250 ; SSE3-NEXT: psubb {{.*}}(%rip), %xmm1 1251 ; SSE3-NEXT: movdqa %xmm1, %xmm0 1252 ; SSE3-NEXT: psrlw $1, %xmm0 1253 ; SSE3-NEXT: 
pand {{.*}}(%rip), %xmm0 1254 ; SSE3-NEXT: psubb %xmm0, %xmm1 1255 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1256 ; SSE3-NEXT: movdqa %xmm1, %xmm2 1257 ; SSE3-NEXT: pand %xmm0, %xmm2 1258 ; SSE3-NEXT: psrlw $2, %xmm1 1259 ; SSE3-NEXT: pand %xmm0, %xmm1 1260 ; SSE3-NEXT: paddb %xmm2, %xmm1 1261 ; SSE3-NEXT: movdqa %xmm1, %xmm0 1262 ; SSE3-NEXT: psrlw $4, %xmm0 1263 ; SSE3-NEXT: paddb %xmm1, %xmm0 1264 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 1265 ; SSE3-NEXT: retq 1266 ; 1267 ; SSSE3-LABEL: testv16i8u: 1268 ; SSSE3: # BB#0: 1269 ; SSSE3-NEXT: pxor %xmm1, %xmm1 1270 ; SSSE3-NEXT: psubb %xmm0, %xmm1 1271 ; SSSE3-NEXT: pand %xmm0, %xmm1 1272 ; SSSE3-NEXT: psubb {{.*}}(%rip), %xmm1 1273 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1274 ; SSSE3-NEXT: movdqa %xmm1, %xmm3 1275 ; SSSE3-NEXT: pand %xmm2, %xmm3 1276 ; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1277 ; SSSE3-NEXT: movdqa %xmm0, %xmm4 1278 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 1279 ; SSSE3-NEXT: psrlw $4, %xmm1 1280 ; SSSE3-NEXT: pand %xmm2, %xmm1 1281 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 1282 ; SSSE3-NEXT: paddb %xmm4, %xmm0 1283 ; SSSE3-NEXT: retq 1284 ; 1285 ; SSE41-LABEL: testv16i8u: 1286 ; SSE41: # BB#0: 1287 ; SSE41-NEXT: pxor %xmm1, %xmm1 1288 ; SSE41-NEXT: psubb %xmm0, %xmm1 1289 ; SSE41-NEXT: pand %xmm0, %xmm1 1290 ; SSE41-NEXT: psubb {{.*}}(%rip), %xmm1 1291 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1292 ; SSE41-NEXT: movdqa %xmm1, %xmm3 1293 ; SSE41-NEXT: pand %xmm2, %xmm3 1294 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1295 ; SSE41-NEXT: movdqa %xmm0, %xmm4 1296 ; SSE41-NEXT: pshufb %xmm3, %xmm4 1297 ; SSE41-NEXT: psrlw $4, %xmm1 1298 ; SSE41-NEXT: pand %xmm2, %xmm1 1299 ; SSE41-NEXT: pshufb %xmm1, %xmm0 1300 ; SSE41-NEXT: paddb %xmm4, %xmm0 1301 ; SSE41-NEXT: retq 1302 ; 1303 ; AVX1-LABEL: testv16i8u: 1304 ; AVX1: # BB#0: 1305 
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1306 ; AVX1-NEXT: vpsubb %xmm0, %xmm1, %xmm1 1307 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1308 ; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1309 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1310 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 1311 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1312 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1313 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1314 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1315 ; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1316 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1317 ; AVX1-NEXT: retq 1318 ; 1319 ; AVX2-LABEL: testv16i8u: 1320 ; AVX2: # BB#0: 1321 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1322 ; AVX2-NEXT: vpsubb %xmm0, %xmm1, %xmm1 1323 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 1324 ; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1325 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1326 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 1327 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1328 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1329 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 1330 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 1331 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1332 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1333 ; AVX2-NEXT: retq 1334 ; 1335 ; AVX512CDVL-LABEL: testv16i8u: 1336 ; AVX512CDVL: # BB#0: 1337 ; AVX512CDVL-NEXT: vpxord %xmm1, %xmm1, %xmm1 1338 ; AVX512CDVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 1339 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm0 1340 ; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1341 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1342 ; AVX512CDVL-NEXT: vpandq %xmm1, %xmm0, %xmm2 1343 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1344 ; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1345 ; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 1346 ; AVX512CDVL-NEXT: vpandq %xmm1, 
%xmm0, %xmm0 1347 ; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1348 ; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1349 ; AVX512CDVL-NEXT: retq 1350 ; 1351 ; AVX512CD-LABEL: testv16i8u: 1352 ; AVX512CD: # BB#0: 1353 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 1354 ; AVX512CD-NEXT: vpsubb %xmm0, %xmm1, %xmm1 1355 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 1356 ; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 1357 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1358 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 1359 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1360 ; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 1361 ; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 1362 ; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 1363 ; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 1364 ; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1365 ; AVX512CD-NEXT: retq 1366 ; 1367 ; X32-SSE-LABEL: testv16i8u: 1368 ; X32-SSE: # BB#0: 1369 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 1370 ; X32-SSE-NEXT: psubb %xmm0, %xmm1 1371 ; X32-SSE-NEXT: pand %xmm0, %xmm1 1372 ; X32-SSE-NEXT: psubb {{\.LCPI.*}}, %xmm1 1373 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1374 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3 1375 ; X32-SSE-NEXT: pand %xmm2, %xmm3 1376 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1377 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4 1378 ; X32-SSE-NEXT: pshufb %xmm3, %xmm4 1379 ; X32-SSE-NEXT: psrlw $4, %xmm1 1380 ; X32-SSE-NEXT: pand %xmm2, %xmm1 1381 ; X32-SSE-NEXT: pshufb %xmm1, %xmm0 1382 ; X32-SSE-NEXT: paddb %xmm4, %xmm0 1383 ; X32-SSE-NEXT: retl 1384 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1) 1385 ret <16 x i8> %out 1386 } 1387 1388 define <2 x i64> @foldv2i64() nounwind { 1389 ; SSE-LABEL: foldv2i64: 1390 ; SSE: # BB#0: 1391 ; SSE-NEXT: movl $8, %eax 1392 ; SSE-NEXT: movd %rax, %xmm0 1393 ; SSE-NEXT: retq 1394 ; 1395 ; AVX-LABEL: foldv2i64: 1396 ; AVX: # BB#0: 1397 
; AVX-NEXT: movl $8, %eax 1398 ; AVX-NEXT: vmovq %rax, %xmm0 1399 ; AVX-NEXT: retq 1400 ; 1401 ; X32-SSE-LABEL: foldv2i64: 1402 ; X32-SSE: # BB#0: 1403 ; X32-SSE-NEXT: movl $8, %eax 1404 ; X32-SSE-NEXT: movd %eax, %xmm0 1405 ; X32-SSE-NEXT: retl 1406 %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0) 1407 ret <2 x i64> %out 1408 } 1409 1410 define <2 x i64> @foldv2i64u() nounwind { 1411 ; SSE-LABEL: foldv2i64u: 1412 ; SSE: # BB#0: 1413 ; SSE-NEXT: movl $8, %eax 1414 ; SSE-NEXT: movd %rax, %xmm0 1415 ; SSE-NEXT: retq 1416 ; 1417 ; AVX-LABEL: foldv2i64u: 1418 ; AVX: # BB#0: 1419 ; AVX-NEXT: movl $8, %eax 1420 ; AVX-NEXT: vmovq %rax, %xmm0 1421 ; AVX-NEXT: retq 1422 ; 1423 ; X32-SSE-LABEL: foldv2i64u: 1424 ; X32-SSE: # BB#0: 1425 ; X32-SSE-NEXT: movl $8, %eax 1426 ; X32-SSE-NEXT: movd %eax, %xmm0 1427 ; X32-SSE-NEXT: retl 1428 %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1) 1429 ret <2 x i64> %out 1430 } 1431 1432 define <4 x i32> @foldv4i32() nounwind { 1433 ; SSE-LABEL: foldv4i32: 1434 ; SSE: # BB#0: 1435 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] 1436 ; SSE-NEXT: retq 1437 ; 1438 ; AVX1-LABEL: foldv4i32: 1439 ; AVX1: # BB#0: 1440 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] 1441 ; AVX1-NEXT: retq 1442 ; 1443 ; AVX2-LABEL: foldv4i32: 1444 ; AVX2: # BB#0: 1445 ; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] 1446 ; AVX2-NEXT: retq 1447 ; 1448 ; AVX512CDVL-LABEL: foldv4i32: 1449 ; AVX512CDVL: # BB#0: 1450 ; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0] 1451 ; AVX512CDVL-NEXT: retq 1452 ; 1453 ; AVX512CD-LABEL: foldv4i32: 1454 ; AVX512CD: # BB#0: 1455 ; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] 1456 ; AVX512CD-NEXT: retq 1457 ; 1458 ; X32-SSE-LABEL: foldv4i32: 1459 ; X32-SSE: # BB#0: 1460 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] 1461 ; X32-SSE-NEXT: retl 1462 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0) 1463 ret <4 x i32> %out 1464 } 1465 1466 
; Constant-folding tests for the llvm.cttz vector intrinsics (v4i32 "u" case,
; then both v8i16 cases, then both v16i8 cases), followed by the intrinsic
; declarations.
;
; Every function here calls the intrinsic on an all-constant vector. In each
; expected-output stanza the function body is a single vector move of a
; constant into xmm0 followed by a return, so no bit-counting instructions are
; expected in the generated assembly. Expected constants, lane by lane
;
;   foldv4i32u             ->  [8,0,32,0]
;   foldv8i16, foldv8i16u  ->  [8,0,16,0,16,0,3,3]
;   foldv16i8, foldv16i8u  ->  [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
;
; Within the v8i16 and v16i8 pairs, the "u" function differs from its sibling
; only in passing i1 -1 instead of i1 0 as the second intrinsic argument; the
; expected constants are identical in both, including the lanes whose input
; is zero (those lanes expect the full element width).
;
; NOTE(review) - some literals do not fit their element type (i16 -65536,
; i8 256, i8 -65536). Their expected lanes (16 for i16, 8 for i8) are the same
; values expected for the explicit zero lanes, so the checks appear to rely on
; these literals being truncated to zero when the file is parsed. Confirm that
; the IR parser in use still accepts them before touching the inputs.
;
; The assertion comments are autogenerated (the note at the top of this file
; names utils/update_llc_test_checks.py). Regenerate them with that script
; rather than editing them by hand.
define <4 x i32> @foldv4i32u() nounwind { 1467 ; SSE-LABEL: foldv4i32u: 1468 ; SSE: # BB#0: 1469 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] 1470 ; SSE-NEXT: retq 1471 ; 1472 ; AVX1-LABEL: foldv4i32u: 1473 ; AVX1: # BB#0: 1474 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] 1475 ; AVX1-NEXT: retq 1476 ; 1477 ; AVX2-LABEL: foldv4i32u: 1478 ; AVX2: # BB#0: 1479 ; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] 1480 ; AVX2-NEXT: retq 1481 ; 1482 ; AVX512CDVL-LABEL: foldv4i32u: 1483 ; AVX512CDVL: # BB#0: 1484 ; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} xmm0 = [8,0,32,0] 1485 ; AVX512CDVL-NEXT: retq 1486 ; 1487 ; AVX512CD-LABEL: foldv4i32u: 1488 ; AVX512CD: # BB#0: 1489 ; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] 1490 ; AVX512CD-NEXT: retq 1491 ; 1492 ; X32-SSE-LABEL: foldv4i32u: 1493 ; X32-SSE: # BB#0: 1494 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] 1495 ; X32-SSE-NEXT: retl 1496 %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1) 1497 ret <4 x i32> %out 1498 } 1499 1500 define <8 x i16> @foldv8i16() nounwind { 1501 ; SSE-LABEL: foldv8i16: 1502 ; SSE: # BB#0: 1503 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1504 ; SSE-NEXT: retq 1505 ; 1506 ; AVX1-LABEL: foldv8i16: 1507 ; AVX1: # BB#0: 1508 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1509 ; AVX1-NEXT: retq 1510 ; 1511 ; AVX2-LABEL: foldv8i16: 1512 ; AVX2: # BB#0: 1513 ; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1514 ; AVX2-NEXT: retq 1515 ; 1516 ; AVX512CDVL-LABEL: foldv8i16: 1517 ; AVX512CDVL: # BB#0: 1518 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1519 ; AVX512CDVL-NEXT: retq 1520 ; 1521 ; AVX512CD-LABEL: foldv8i16: 1522 ; AVX512CD: # BB#0: 1523 ; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1524 ; AVX512CD-NEXT: retq 1525 ; 1526 ; X32-SSE-LABEL: foldv8i16: 1527 ; X32-SSE: # BB#0: 1528 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1529 ; X32-SSE-NEXT: retl 1530 %out = call <8 x i16> 
@llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0) 1531 ret <8 x i16> %out 1532 } 1533 1534 define <8 x i16> @foldv8i16u() nounwind { 1535 ; SSE-LABEL: foldv8i16u: 1536 ; SSE: # BB#0: 1537 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1538 ; SSE-NEXT: retq 1539 ; 1540 ; AVX1-LABEL: foldv8i16u: 1541 ; AVX1: # BB#0: 1542 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1543 ; AVX1-NEXT: retq 1544 ; 1545 ; AVX2-LABEL: foldv8i16u: 1546 ; AVX2: # BB#0: 1547 ; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1548 ; AVX2-NEXT: retq 1549 ; 1550 ; AVX512CDVL-LABEL: foldv8i16u: 1551 ; AVX512CDVL: # BB#0: 1552 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1553 ; AVX512CDVL-NEXT: retq 1554 ; 1555 ; AVX512CD-LABEL: foldv8i16u: 1556 ; AVX512CD: # BB#0: 1557 ; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1558 ; AVX512CD-NEXT: retq 1559 ; 1560 ; X32-SSE-LABEL: foldv8i16u: 1561 ; X32-SSE: # BB#0: 1562 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] 1563 ; X32-SSE-NEXT: retl 1564 %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1) 1565 ret <8 x i16> %out 1566 } 1567 1568 define <16 x i8> @foldv16i8() nounwind { 1569 ; SSE-LABEL: foldv16i8: 1570 ; SSE: # BB#0: 1571 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1572 ; SSE-NEXT: retq 1573 ; 1574 ; AVX1-LABEL: foldv16i8: 1575 ; AVX1: # BB#0: 1576 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1577 ; AVX1-NEXT: retq 1578 ; 1579 ; AVX2-LABEL: foldv16i8: 1580 ; AVX2: # BB#0: 1581 ; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1582 ; AVX2-NEXT: retq 1583 ; 1584 ; AVX512CDVL-LABEL: foldv16i8: 1585 ; AVX512CDVL: # BB#0: 1586 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1587 ; AVX512CDVL-NEXT: retq 1588 ; 1589 ; AVX512CD-LABEL: foldv16i8: 1590 ; 
AVX512CD: # BB#0: 1591 ; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1592 ; AVX512CD-NEXT: retq 1593 ; 1594 ; X32-SSE-LABEL: foldv16i8: 1595 ; X32-SSE: # BB#0: 1596 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1597 ; X32-SSE-NEXT: retl 1598 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0) 1599 ret <16 x i8> %out 1600 } 1601 1602 define <16 x i8> @foldv16i8u() nounwind { 1603 ; SSE-LABEL: foldv16i8u: 1604 ; SSE: # BB#0: 1605 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1606 ; SSE-NEXT: retq 1607 ; 1608 ; AVX1-LABEL: foldv16i8u: 1609 ; AVX1: # BB#0: 1610 ; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1611 ; AVX1-NEXT: retq 1612 ; 1613 ; AVX2-LABEL: foldv16i8u: 1614 ; AVX2: # BB#0: 1615 ; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1616 ; AVX2-NEXT: retq 1617 ; 1618 ; AVX512CDVL-LABEL: foldv16i8u: 1619 ; AVX512CDVL: # BB#0: 1620 ; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1621 ; AVX512CDVL-NEXT: retq 1622 ; 1623 ; AVX512CD-LABEL: foldv16i8u: 1624 ; AVX512CD: # BB#0: 1625 ; AVX512CD-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1626 ; AVX512CD-NEXT: retq 1627 ; 1628 ; X32-SSE-LABEL: foldv16i8u: 1629 ; X32-SSE: # BB#0: 1630 ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] 1631 ; X32-SSE-NEXT: retl 1632 %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1) 1633 ret <16 x i8> %out 1634 } 1635 1636 declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) 1637 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) 1638 declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1) 1639 declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1) 1640