; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD
;
; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32-AVX

; ctlz(<4 x i64>) with zero-is-undef == false. AVX512CD targets lower this to
; a single vplzcntq; everything else emulates it with a per-nibble vpshufb
; lookup whose partial counts are merged stepwise from bytes up to qwords.
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm6
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm6, %xmm7
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm6
; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm2, %xmm6
; AVX1-NEXT:    vpsrld $16, %xmm6, %xmm6
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm6
; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm3, %xmm6
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpsrld $16, %xmm4, %xmm4
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm1
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: testv4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: testv4i64:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512VLBWDQ-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512VLBWDQ-NEXT:    retq
;
; AVX512VLCD-LABEL: testv4i64:
; AVX512VLCD:       # %bb.0:
; AVX512VLCD-NEXT:    vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512CD-NEXT:    retq
;
; X32-AVX-LABEL: testv4i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X32-AVX-NEXT:    vpand %ymm2, %ymm1, %ymm2
; X32-AVX-NEXT:    vpsrlw $8, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; X32-AVX-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; X32-AVX-NEXT:    vpsrld $16, %ymm2, %ymm2
; X32-AVX-NEXT:    vpand %ymm2, %ymm1, %ymm2
; X32-AVX-NEXT:    vpsrld $16, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; X32-AVX-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; X32-AVX-NEXT:    vpsrlq $32, %ymm0, %ymm0
; X32-AVX-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT:    retl

  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

; Same as testv4i64 but with zero-is-undef == true (i1 -1); codegen here is
; identical to the defined-at-zero case for every configuration tested.
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm6
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm6, %xmm7
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm6
; AVX1-NEXT:    vpaddb %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm6
; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT:    vpaddw %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm2, %xmm6
; AVX1-NEXT:    vpsrld $16, %xmm6, %xmm6
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm6
; AVX1-NEXT:    vpsrld $16, %xmm5, %xmm5
; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm6
; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm3, %xmm6
; AVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddb %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpsrld $16, %xmm4, %xmm4
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm4
; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm1
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64u:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: testv4i64u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: testv4i64u:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrld $16, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512VLBWDQ-NEXT:    vpsrlq $32, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX512VLBWDQ-NEXT:    retq
;
; AVX512VLCD-LABEL: testv4i64u:
; AVX512VLCD:       # %bb.0:
; AVX512VLCD-NEXT:    vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512CD-NEXT:    retq
;
; X32-AVX-LABEL: testv4i64u:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X32-AVX-NEXT:    vpand %ymm2, %ymm1, %ymm2
; X32-AVX-NEXT:    vpsrlw $8, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; X32-AVX-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm2
; X32-AVX-NEXT:    vpsrld $16, %ymm2, %ymm2
; X32-AVX-NEXT:    vpand %ymm2, %ymm1, %ymm2
; X32-AVX-NEXT:    vpsrld $16, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; X32-AVX-NEXT:    vpcmpeqd %ymm4, %ymm0, %ymm0
; X32-AVX-NEXT:    vpsrlq $32, %ymm0, %ymm0
; X32-AVX-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT:    retl

  %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

; ctlz(<8 x i32>) with zero-is-undef == false; the LUT emulation stops merging
; at dword granularity, and CD targets use vplzcntd.
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqw %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: testv8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: testv8i32:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512VLBWDQ-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512VLBWDQ-NEXT:    retq
;
; AVX512VLCD-LABEL: testv8i32:
; AVX512VLCD:       # %bb.0:
; AVX512VLCD-NEXT:    vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512CD-NEXT:    retq
;
; X32-AVX-LABEL: testv8i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X32-AVX-NEXT:    vpand %ymm2, %ymm1, %ymm2
; X32-AVX-NEXT:    vpsrlw $8, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; X32-AVX-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; X32-AVX-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-AVX-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT:    vpsrld $16, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT:    retl

  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

; Same as testv8i32 but with zero-is-undef == true (i1 -1); codegen here is
; identical to the defined-at-zero case for every configuration tested.
define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqw %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32u:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: testv8i32u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512VLBWDQ-LABEL: testv8i32u:
; AVX512VLBWDQ:       # %bb.0:
; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VLBWDQ-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT:    vpand %ymm2, %ymm1, %ymm2
; AVX512VLBWDQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX512VLBWDQ-NEXT:    vpsrld $16, %ymm1, %ymm1
; AVX512VLBWDQ-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; AVX512VLBWDQ-NEXT:    retq
;
; AVX512VLCD-LABEL: testv8i32u:
; AVX512VLCD:       # %bb.0:
; AVX512VLCD-NEXT:    vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512CD-NEXT:    retq
;
; X32-AVX-LABEL: testv8i32u:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT:    vpand %ymm1, %ymm4, %ymm1
; X32-AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; X32-AVX-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm2
; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X32-AVX-NEXT:    vpand %ymm2, %ymm1, %ymm2
; X32-AVX-NEXT:    vpsrlw $8, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
; X32-AVX-NEXT:    vpcmpeqw %ymm4, %ymm0, %ymm0
; X32-AVX-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-AVX-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT:    vpsrld $16, %ymm1, %ymm1
; X32-AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; X32-AVX-NEXT:    retl

  %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

; NOTE(review): testv16i16 continues past the end of this chunk; the text below
; is reproduced verbatim up to the chunk boundary.
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT:    vpaddb %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT:    vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: testv16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT:    vpand %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT:
vpxor %xmm4, %xmm4, %xmm4 785 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 786 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 787 ; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 788 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 789 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 790 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 791 ; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 792 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 793 ; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0 794 ; AVX512VL-NEXT: retq 795 ; 796 ; AVX512VLBWDQ-LABEL: testv16i16: 797 ; AVX512VLBWDQ: # %bb.0: 798 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 799 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 800 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 801 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 802 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 803 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 804 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 805 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 806 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 807 ; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 808 ; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 809 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 810 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 811 ; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 812 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 813 ; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0 814 ; AVX512VLBWDQ-NEXT: retq 815 ; 816 ; AVX512-LABEL: testv16i16: 817 ; AVX512: # %bb.0: 818 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 819 ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 820 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 821 ; AVX512-NEXT: vpsubw 
{{.*}}(%rip), %ymm0, %ymm0 822 ; AVX512-NEXT: retq 823 ; 824 ; X32-AVX-LABEL: testv16i16: 825 ; X32-AVX: # %bb.0: 826 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 827 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 828 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 829 ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 830 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4 831 ; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1 832 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 833 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 834 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 835 ; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1 836 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 837 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 838 ; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0 839 ; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0 840 ; X32-AVX-NEXT: vpsrlw $8, %ymm1, %ymm1 841 ; X32-AVX-NEXT: vpaddw %ymm0, %ymm1, %ymm0 842 ; X32-AVX-NEXT: retl 843 %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 0) 844 ret <16 x i16> %out 845 } 846 847 define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { 848 ; AVX1-LABEL: testv16i16u: 849 ; AVX1: # %bb.0: 850 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 851 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 852 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 853 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 854 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 855 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 856 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 857 ; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 858 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm5, %xmm7 859 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 860 ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 861 ; AVX1-NEXT: vpaddb %xmm5, %xmm3, %xmm3 862 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm1, %xmm1 863 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 864 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 865 ; 
AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 866 ; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 867 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 868 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 869 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm5 870 ; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 871 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm5 872 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 873 ; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm2 874 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 875 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm0, %xmm0 876 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 877 ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 878 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 879 ; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 880 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 881 ; AVX1-NEXT: retq 882 ; 883 ; AVX2-LABEL: testv16i16u: 884 ; AVX2: # %bb.0: 885 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 886 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 887 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 888 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 889 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4 890 ; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 891 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 892 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 893 ; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 894 ; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 895 ; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 896 ; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 897 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 898 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 899 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 900 ; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 901 ; AVX2-NEXT: retq 902 ; 903 ; AVX512VL-LABEL: testv16i16u: 904 ; AVX512VL: # %bb.0: 905 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 906 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 907 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 908 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 909 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 910 ; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 911 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 912 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 913 ; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 914 ; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 915 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 916 ; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 917 ; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 918 ; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 919 ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 920 ; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0 921 ; AVX512VL-NEXT: retq 922 ; 923 ; AVX512VLBWDQ-LABEL: testv16i16u: 924 ; AVX512VLBWDQ: # %bb.0: 925 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 926 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 927 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 928 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 929 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 930 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 931 ; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 932 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 933 ; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 934 ; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 935 ; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 936 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 937 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 938 ; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 939 ; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 940 ; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0 941 ; AVX512VLBWDQ-NEXT: retq 942 ; 943 ; AVX512-LABEL: testv16i16u: 944 ; AVX512: # %bb.0: 945 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 946 ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 947 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 948 ; AVX512-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 949 ; AVX512-NEXT: retq 950 ; 951 ; X32-AVX-LABEL: testv16i16u: 952 ; X32-AVX: # %bb.0: 953 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 954 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 955 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 956 ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 957 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4 958 ; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1 959 ; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 960 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 961 ; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2 962 ; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1 963 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm2, %ymm1 964 ; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 965 ; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0 966 ; X32-AVX-NEXT: vpand %ymm0, %ymm1, %ymm0 967 ; X32-AVX-NEXT: vpsrlw $8, %ymm1, %ymm1 968 ; X32-AVX-NEXT: vpaddw %ymm0, %ymm1, %ymm0 969 ; X32-AVX-NEXT: retl 970 %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 -1) 971 ret <16 x i16> %out 972 } 973 974 define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { 975 ; AVX1-LABEL: testv32i8: 976 ; AVX1: # %bb.0: 977 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 978 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 979 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 980 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 981 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 982 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 983 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 984 ; AVX1-NEXT: vpxor %xmm5, %xmm5, 
%xmm5 985 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6 986 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 987 ; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 988 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 989 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 990 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 991 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 992 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 993 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2 994 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 995 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 996 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 997 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 998 ; AVX1-NEXT: retq 999 ; 1000 ; AVX2-LABEL: testv32i8: 1001 ; AVX2: # %bb.0: 1002 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1003 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1004 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1005 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1006 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1007 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1008 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1009 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 1010 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 1011 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1012 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1013 ; AVX2-NEXT: retq 1014 ; 1015 ; AVX512VL-LABEL: testv32i8: 1016 ; AVX512VL: # %bb.0: 1017 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1018 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 1019 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1020 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1021 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 1022 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 1023 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1024 ; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 1025 ; AVX512VL-NEXT: vpand %ymm1, %ymm2, 
%ymm1 1026 ; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1027 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1028 ; AVX512VL-NEXT: retq 1029 ; 1030 ; AVX512VLBWDQ-LABEL: testv32i8: 1031 ; AVX512VLBWDQ: # %bb.0: 1032 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1033 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 1034 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1035 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1036 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 1037 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 1038 ; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 1039 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 1040 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 1041 ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1042 ; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1043 ; AVX512VLBWDQ-NEXT: retq 1044 ; 1045 ; AVX512-LABEL: testv32i8: 1046 ; AVX512: # %bb.0: 1047 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1048 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 1049 ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 1050 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 1051 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] 1052 ; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 1053 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1054 ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 1055 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1056 ; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 1057 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1058 ; AVX512-NEXT: retq 1059 ; 1060 ; X32-AVX-LABEL: testv32i8: 1061 ; X32-AVX: # %bb.0: 1062 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1063 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 1064 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1065 ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1066 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 1067 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 1068 ; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1069 ; X32-AVX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 1070 ; X32-AVX-NEXT: vpand %ymm1, %ymm2, %ymm1 1071 ; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1072 ; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1073 ; X32-AVX-NEXT: retl 1074 %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0) 1075 ret <32 x i8> %out 1076 } 1077 1078 define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { 1079 ; AVX1-LABEL: testv32i8u: 1080 ; AVX1: # %bb.0: 1081 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1082 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1083 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 1084 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1085 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1086 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1087 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1088 ; AVX1-NEXT: vpxor %xmm5, 
%xmm5, %xmm5 1089 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm6 1090 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 1091 ; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 1092 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 1093 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 1094 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 1095 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1096 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1097 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm2 1098 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 1099 ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 1100 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 1101 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1102 ; AVX1-NEXT: retq 1103 ; 1104 ; AVX2-LABEL: testv32i8u: 1105 ; AVX2: # %bb.0: 1106 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1107 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1108 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1109 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1110 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1111 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1112 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1113 ; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 1114 ; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 1115 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1116 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1117 ; AVX2-NEXT: retq 1118 ; 1119 ; AVX512VL-LABEL: testv32i8u: 1120 ; AVX512VL: # %bb.0: 1121 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1122 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 1123 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1124 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1125 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 1126 ; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 1127 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1128 ; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 1129 ; AVX512VL-NEXT: 
vpand %ymm1, %ymm2, %ymm1 1130 ; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1131 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1132 ; AVX512VL-NEXT: retq 1133 ; 1134 ; AVX512VLBWDQ-LABEL: testv32i8u: 1135 ; AVX512VLBWDQ: # %bb.0: 1136 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1137 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 1138 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1139 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1140 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 1141 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 1142 ; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 1143 ; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 1144 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 1145 ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1146 ; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1147 ; AVX512VLBWDQ-NEXT: retq 1148 ; 1149 ; AVX512-LABEL: testv32i8u: 1150 ; AVX512: # %bb.0: 1151 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 1152 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 1153 ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 1154 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 1155 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] 1156 ; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1 1157 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1158 ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 1159 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1160 ; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0 1161 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1162 ; AVX512-NEXT: retq 1163 ; 1164 ; X32-AVX-LABEL: testv32i8u: 1165 ; X32-AVX: # %bb.0: 1166 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1167 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 1168 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 1169 ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1170 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 1171 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 1172 ; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1173 ; X32-AVX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 1174 ; X32-AVX-NEXT: vpand %ymm1, %ymm2, %ymm1 1175 ; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1176 ; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1177 ; X32-AVX-NEXT: retl 1178 %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1) 1179 ret <32 x i8> %out 1180 } 1181 1182 define <4 x i64> @foldv4i64() nounwind { 1183 ; X64-LABEL: foldv4i64: 1184 ; X64: # %bb.0: 1185 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] 1186 ; X64-NEXT: retq 1187 ; 1188 ; X32-AVX-LABEL: foldv4i64: 1189 ; X32-AVX: # %bb.0: 1190 ; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] 1191 ; X32-AVX-NEXT: retl 1192 %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0) 1193 ret <4 x i64> %out 1194 } 1195 1196 define <4 x i64> @foldv4i64u() nounwind { 1197 ; X64-LABEL: foldv4i64u: 
1198 ; X64: # %bb.0: 1199 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] 1200 ; X64-NEXT: retq 1201 ; 1202 ; X32-AVX-LABEL: foldv4i64u: 1203 ; X32-AVX: # %bb.0: 1204 ; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0] 1205 ; X32-AVX-NEXT: retl 1206 %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1) 1207 ret <4 x i64> %out 1208 } 1209 1210 define <8 x i32> @foldv8i32() nounwind { 1211 ; X64-LABEL: foldv8i32: 1212 ; X64: # %bb.0: 1213 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] 1214 ; X64-NEXT: retq 1215 ; 1216 ; X32-AVX-LABEL: foldv8i32: 1217 ; X32-AVX: # %bb.0: 1218 ; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] 1219 ; X32-AVX-NEXT: retl 1220 %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0) 1221 ret <8 x i32> %out 1222 } 1223 1224 define <8 x i32> @foldv8i32u() nounwind { 1225 ; X64-LABEL: foldv8i32u: 1226 ; X64: # %bb.0: 1227 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] 1228 ; X64-NEXT: retq 1229 ; 1230 ; X32-AVX-LABEL: foldv8i32u: 1231 ; X32-AVX: # %bb.0: 1232 ; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] 1233 ; X32-AVX-NEXT: retl 1234 %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1) 1235 ret <8 x i32> %out 1236 } 1237 1238 define <16 x i16> @foldv16i16() nounwind { 1239 ; X64-LABEL: foldv16i16: 1240 ; X64: # %bb.0: 1241 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] 1242 ; X64-NEXT: retq 1243 ; 1244 ; X32-AVX-LABEL: foldv16i16: 1245 ; X32-AVX: # %bb.0: 1246 ; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] 1247 ; X32-AVX-NEXT: retl 1248 %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0) 1249 ret 
<16 x i16> %out 1250 } 1251 1252 define <16 x i16> @foldv16i16u() nounwind { 1253 ; X64-LABEL: foldv16i16u: 1254 ; X64: # %bb.0: 1255 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] 1256 ; X64-NEXT: retq 1257 ; 1258 ; X32-AVX-LABEL: foldv16i16u: 1259 ; X32-AVX: # %bb.0: 1260 ; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] 1261 ; X32-AVX-NEXT: retl 1262 %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1) 1263 ret <16 x i16> %out 1264 } 1265 1266 define <32 x i8> @foldv32i8() nounwind { 1267 ; X64-LABEL: foldv32i8: 1268 ; X64: # %bb.0: 1269 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] 1270 ; X64-NEXT: retq 1271 ; 1272 ; X32-AVX-LABEL: foldv32i8: 1273 ; X32-AVX: # %bb.0: 1274 ; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] 1275 ; X32-AVX-NEXT: retl 1276 %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0) 1277 ret <32 x i8> %out 1278 } 1279 1280 define <32 x i8> @foldv32i8u() nounwind { 1281 ; X64-LABEL: foldv32i8u: 1282 ; X64: # %bb.0: 1283 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] 1284 ; X64-NEXT: retq 1285 ; 1286 ; X32-AVX-LABEL: foldv32i8u: 1287 ; X32-AVX: # %bb.0: 1288 ; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] 1289 ; X32-AVX-NEXT: retl 1290 %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, 
i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1) 1291 ret <32 x i8> %out 1292 } 1293 1294 declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) 1295 declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) 1296 declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1) 1297 declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1) 1298