; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
;
; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2

define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i64:
; AVX512CDVL: # %bb.0:
; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv4i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv4i64:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: testv4i64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i64u:
; AVX512CDVL: # %bb.0:
; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0
; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63]
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63]
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i64u:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv4i64u:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv4i64u:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i64u:
; BITALG: # %bb.0:
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: testv4i64u:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i32:
; AVX512CDVL: # %bb.0:
; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512CD-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i32:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv8i32:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv8i32:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv8i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: testv8i32:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; X32-AVX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i32u:
; AVX512CDVL: # %bb.0:
; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0
; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31]
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31]
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i32u:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv8i32u:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv8i32u:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv8i32u:
; BITALG: # %bb.0:
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: testv8i32u:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; X32-AVX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw
%xmm0, %xmm1, %xmm2 693 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 694 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 695 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 696 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 697 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 698 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 699 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 700 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 701 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 702 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 703 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 704 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 705 ; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 706 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 707 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 708 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 709 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 710 ; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 711 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 712 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 713 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 714 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 715 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 716 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 717 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 718 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 719 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 720 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 721 ; AVX1-NEXT: retq 722 ; 723 ; AVX2-LABEL: testv16i16: 724 ; AVX2: # %bb.0: 725 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 726 ; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1 727 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 728 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 729 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 730 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 731 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 732 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 733 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 734 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 
735 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 736 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 737 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 738 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 739 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 740 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 741 ; AVX2-NEXT: retq 742 ; 743 ; AVX512CDVL-LABEL: testv16i16: 744 ; AVX512CDVL: # %bb.0: 745 ; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 746 ; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1 747 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 748 ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 749 ; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 750 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 751 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 752 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 753 ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 754 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 755 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 756 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 757 ; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 758 ; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1 759 ; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 760 ; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0 761 ; AVX512CDVL-NEXT: retq 762 ; 763 ; AVX512CD-LABEL: testv16i16: 764 ; AVX512CD: # %bb.0: 765 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 766 ; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1 767 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 768 ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 769 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm0 770 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 771 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 772 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 773 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 774 ; 
AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 775 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 776 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 777 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 778 ; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1 779 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0 780 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 781 ; AVX512CD-NEXT: retq 782 ; 783 ; AVX512VPOPCNTDQ-LABEL: testv16i16: 784 ; AVX512VPOPCNTDQ: # %bb.0: 785 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 786 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 787 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 788 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 789 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 790 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 791 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 792 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 793 ; AVX512VPOPCNTDQ-NEXT: retq 794 ; 795 ; AVX512VPOPCNTDQVL-LABEL: testv16i16: 796 ; AVX512VPOPCNTDQVL: # %bb.0: 797 ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 798 ; AVX512VPOPCNTDQVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1 799 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 800 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 801 ; AVX512VPOPCNTDQVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 802 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 803 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 804 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 805 ; AVX512VPOPCNTDQVL-NEXT: retq 806 ; 807 ; BITALG_NOVLX-LABEL: testv16i16: 808 ; BITALG_NOVLX: # %bb.0: 809 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 810 ; BITALG_NOVLX-NEXT: vpsubw %ymm0, %ymm1, %ymm1 811 ; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 812 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 813 ; BITALG_NOVLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 814 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 815 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 816 ; BITALG_NOVLX-NEXT: retq 817 ; 818 ; BITALG-LABEL: testv16i16: 819 ; BITALG: # %bb.0: 820 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 821 ; BITALG-NEXT: vpsubw %ymm0, %ymm1, %ymm1 822 ; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 823 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 824 ; BITALG-NEXT: vpaddw %ymm1, %ymm0, %ymm0 825 ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 826 ; BITALG-NEXT: retq 827 ; 828 ; X32-AVX-LABEL: testv16i16: 829 ; X32-AVX: # %bb.0: 830 ; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 831 ; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1 832 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 833 ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 834 ; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 835 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 836 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 837 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 838 ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 839 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 840 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 841 ; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 842 ; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 843 ; X32-AVX-NEXT: vpsllw $8, %ymm0, %ymm1 844 ; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0 845 ; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0 846 ; X32-AVX-NEXT: retl 847 %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0) 848 ret <16 x i16> %out 849 } 850 851 define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { 852 ; AVX1-LABEL: testv16i16u: 853 ; AVX1: # %bb.0: 854 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 855 ; AVX1-NEXT: vpsubw %xmm0, 
%xmm1, %xmm2 856 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 857 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 858 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 859 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 860 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 861 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 862 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 863 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 864 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 865 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 866 ; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 867 ; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 868 ; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 869 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 870 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 871 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 872 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 873 ; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 874 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 875 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 876 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 877 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 878 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 879 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 880 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 881 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 882 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 883 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 884 ; AVX1-NEXT: retq 885 ; 886 ; AVX2-LABEL: testv16i16u: 887 ; AVX2: # %bb.0: 888 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 889 ; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1 890 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 891 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 892 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 893 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 894 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 895 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 896 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 897 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 898 ; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 899 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 900 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 901 ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 902 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 903 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 904 ; AVX2-NEXT: retq 905 ; 906 ; AVX512CDVL-LABEL: testv16i16u: 907 ; AVX512CDVL: # %bb.0: 908 ; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 909 ; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1 910 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 911 ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 912 ; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 913 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 914 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 915 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 916 ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 917 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 918 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 919 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 920 ; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 921 ; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1 922 ; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 923 ; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0 924 ; AVX512CDVL-NEXT: retq 925 ; 926 ; AVX512CD-LABEL: testv16i16u: 927 ; AVX512CD: # %bb.0: 928 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 929 ; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1 930 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 931 ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 932 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm0 933 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 934 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 935 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 936 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 937 ; AVX512CD-NEXT: 
vpsrlw $4, %ymm0, %ymm0 938 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 939 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 940 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 941 ; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1 942 ; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0 943 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 944 ; AVX512CD-NEXT: retq 945 ; 946 ; AVX512VPOPCNTDQ-LABEL: testv16i16u: 947 ; AVX512VPOPCNTDQ: # %bb.0: 948 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 949 ; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 950 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 951 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 952 ; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 953 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 954 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 955 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 956 ; AVX512VPOPCNTDQ-NEXT: retq 957 ; 958 ; AVX512VPOPCNTDQVL-LABEL: testv16i16u: 959 ; AVX512VPOPCNTDQVL: # %bb.0: 960 ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 961 ; AVX512VPOPCNTDQVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1 962 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 963 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 964 ; AVX512VPOPCNTDQVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 965 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 966 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 967 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 968 ; AVX512VPOPCNTDQVL-NEXT: retq 969 ; 970 ; BITALG_NOVLX-LABEL: testv16i16u: 971 ; BITALG_NOVLX: # %bb.0: 972 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 973 ; 
BITALG_NOVLX-NEXT: vpsubw %ymm0, %ymm1, %ymm1 974 ; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 975 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 976 ; BITALG_NOVLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 977 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 978 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 979 ; BITALG_NOVLX-NEXT: retq 980 ; 981 ; BITALG-LABEL: testv16i16u: 982 ; BITALG: # %bb.0: 983 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 984 ; BITALG-NEXT: vpsubw %ymm0, %ymm1, %ymm1 985 ; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 986 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 987 ; BITALG-NEXT: vpaddw %ymm1, %ymm0, %ymm0 988 ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 989 ; BITALG-NEXT: retq 990 ; 991 ; X32-AVX-LABEL: testv16i16u: 992 ; X32-AVX: # %bb.0: 993 ; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 994 ; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1 995 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 996 ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 997 ; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 998 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 999 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 1000 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1001 ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1002 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 1003 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 1004 ; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1005 ; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1006 ; X32-AVX-NEXT: vpsllw $8, %ymm0, %ymm1 1007 ; X32-AVX-NEXT: vpaddb %ymm0, %ymm1, %ymm0 1008 ; X32-AVX-NEXT: vpsrlw $8, %ymm0, %ymm0 1009 ; X32-AVX-NEXT: retl 1010 %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1) 1011 ret <16 x i16> %out 1012 } 1013 1014 define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { 1015 ; AVX1-LABEL: testv32i8: 1016 ; AVX1: # %bb.0: 1017 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1018 ; AVX1-NEXT: vpxor 
%xmm2, %xmm2, %xmm2 1019 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 1020 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1021 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 1022 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 1023 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1024 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 1025 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1026 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1027 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1028 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1029 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1030 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 1031 ; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 1032 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1033 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 1034 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 1035 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1036 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1037 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1038 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1039 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1040 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1041 ; AVX1-NEXT: retq 1042 ; 1043 ; AVX2-LABEL: testv32i8: 1044 ; AVX2: # %bb.0: 1045 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1046 ; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1047 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1048 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1049 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1050 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1051 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1052 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1053 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1054 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1055 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1056 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1057 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1058 ; AVX2-NEXT: retq 1059 ; 1060 ; AVX512CDVL-LABEL: testv32i8: 1061 ; AVX512CDVL: # 
%bb.0: 1062 ; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1063 ; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1064 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 1065 ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1066 ; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1067 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1068 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 1069 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1070 ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1071 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 1072 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 1073 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1074 ; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1075 ; AVX512CDVL-NEXT: retq 1076 ; 1077 ; AVX512CD-LABEL: testv32i8: 1078 ; AVX512CD: # %bb.0: 1079 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 1080 ; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1081 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 1082 ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1083 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1084 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1085 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 1086 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1087 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1088 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 1089 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 1090 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1091 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1092 ; AVX512CD-NEXT: retq 1093 ; 1094 ; AVX512VPOPCNTDQ-LABEL: testv32i8: 1095 ; AVX512VPOPCNTDQ: # %bb.0: 1096 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 1097 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1098 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 1099 ; 
AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1100 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1101 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1102 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 1103 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1104 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1105 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 1106 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 1107 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1108 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1109 ; AVX512VPOPCNTDQ-NEXT: retq 1110 ; 1111 ; AVX512VPOPCNTDQVL-LABEL: testv32i8: 1112 ; AVX512VPOPCNTDQVL: # %bb.0: 1113 ; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1114 ; AVX512VPOPCNTDQVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1115 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 1116 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1117 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1118 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1119 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 1120 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1121 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1122 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 1123 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 1124 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1125 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1126 ; AVX512VPOPCNTDQVL-NEXT: retq 1127 ; 1128 ; BITALG_NOVLX-LABEL: testv32i8: 1129 ; BITALG_NOVLX: # %bb.0: 1130 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1131 ; BITALG_NOVLX-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1132 ; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, 
%ymm0 1133 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1134 ; BITALG_NOVLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1135 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 1136 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1137 ; BITALG_NOVLX-NEXT: retq 1138 ; 1139 ; BITALG-LABEL: testv32i8: 1140 ; BITALG: # %bb.0: 1141 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 1142 ; BITALG-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1143 ; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 1144 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1145 ; BITALG-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1146 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 1147 ; BITALG-NEXT: retq 1148 ; 1149 ; X32-AVX-LABEL: testv32i8: 1150 ; X32-AVX: # %bb.0: 1151 ; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1152 ; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1153 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 1154 ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1155 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1156 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1157 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 1158 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1159 ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1160 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 1161 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 1162 ; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1163 ; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1164 ; X32-AVX-NEXT: retl 1165 %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0) 1166 ret <32 x i8> %out 1167 } 1168 1169 define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { 1170 ; AVX1-LABEL: testv32i8u: 1171 ; AVX1: # %bb.0: 1172 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1173 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1174 ; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 1175 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1176 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 1177 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 1178 ; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1179 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 1180 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1181 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1182 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1183 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1184 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1185 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 1186 ; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 1187 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1188 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 1189 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 1190 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1191 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1192 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1193 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1194 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 1195 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1196 ; AVX1-NEXT: retq 1197 ; 1198 ; AVX2-LABEL: testv32i8u: 1199 ; AVX2: # %bb.0: 1200 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1201 ; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1202 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1203 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1204 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1205 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1206 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1207 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1208 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1209 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1210 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1211 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1212 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1213 ; AVX2-NEXT: retq 1214 ; 1215 ; AVX512CDVL-LABEL: testv32i8u: 1216 ; AVX512CDVL: # %bb.0: 1217 ; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1218 ; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1219 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 1220 ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, 
%ymm1 1221 ; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1222 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1223 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 1224 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1225 ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1226 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 1227 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 1228 ; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1229 ; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1230 ; AVX512CDVL-NEXT: retq 1231 ; 1232 ; AVX512CD-LABEL: testv32i8u: 1233 ; AVX512CD: # %bb.0: 1234 ; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 1235 ; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1236 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 1237 ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1238 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1239 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1240 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 1241 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 1242 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1243 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 1244 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 1245 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 1246 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 1247 ; AVX512CD-NEXT: retq 1248 ; 1249 ; AVX512VPOPCNTDQ-LABEL: testv32i8u: 1250 ; AVX512VPOPCNTDQ: # %bb.0: 1251 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 1252 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 1253 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 1254 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1255 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm0 1256 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv32i8u:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv32i8u:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; BITALG_NOVLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv32i8u:
; BITALG: # %bb.0:
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; BITALG-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; BITALG-NEXT: vpopcntb %ymm0, %ymm0
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: testv32i8u:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: retl
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

; The fold* tests below call llvm.cttz on all-constant vectors, so the whole
; computation is constant-folded at compile time and every configuration is
; expected to emit only a load of the precomputed result (e.g. for i64:
; cttz(256)=8, cttz(-1)=0, cttz(0)=64, cttz(255)=0 -> [8,0,64,0]).
; NOTE(review): the X32-AVX result [8,0,0,0,64,0,0,0] is the same constant
; with each i64 element printed as two i32 halves on the 32-bit target.
; CHECK lines are autogenerated; do not hand-edit them, regenerate with
; utils/update_llc_test_checks.py instead.
define <4 x i64> @foldv4i64() nounwind {
; AVX-LABEL: foldv4i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv4i64:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv4i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: foldv4i64:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
; X32-AVX-NEXT: retl
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

; Same constants as foldv4i64, but with the second cttz operand i1 -1 (true),
; i.e. zero inputs are undefined; after folding the result is identical.
define <4 x i64> @foldv4i64u() nounwind {
; AVX-LABEL: foldv4i64u:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv4i64u:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv4i64u:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: foldv4i64u:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
; X32-AVX-NEXT: retl
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; AVX-LABEL: foldv8i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv8i32:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv8i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: foldv8i32:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; X32-AVX-NEXT: retl
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; AVX-LABEL: foldv8i32u:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv8i32u:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv8i32u:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: foldv8i32u:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; X32-AVX-NEXT: retl
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; AVX-LABEL: foldv16i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv16i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv16i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: foldv16i16:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; X32-AVX-NEXT: retl
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; AVX-LABEL: foldv16i16u:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv16i16u:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv16i16u:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: foldv16i16u:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; X32-AVX-NEXT: retl
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; AVX-LABEL: foldv32i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv32i8:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv32i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: foldv32i8:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; X32-AVX-NEXT: retl
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; AVX-LABEL: foldv32i8u:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv32i8u:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv32i8u:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; BITALG-NEXT: retq
;
; X32-AVX-LABEL: foldv32i8u:
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; X32-AVX-NEXT: retl
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

; Overloaded cttz intrinsic declarations; the i1 operand selects whether a
; zero input is undefined (true) or returns the bit width (false).
declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)