; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG

; cttz on <8 x i64> with the i1 flag operand false.
; Every run is expected to form (x & -x) - 1 (vpsubq from zero, vpandq, add of
; the all-ones vector built by vpternlogd $255) and then count its set bits:
; vpopcntq in the vpopcntdq run, otherwise a vpshufb nibble table summed per
; qword with vpsadbw. The avx512f+cd run without bw does this per 256-bit half.
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm5
; AVX512CD-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT:    retq
;
; AVX512CDBW-LABEL: testv8i64:
; AVX512CDBW:       # %bb.0:
; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512CDBW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; AVX512CDBW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    retq
;
; AVX512BW-LABEL: testv8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    retq
;
; BITALG-LABEL: testv8i64:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT:    retq
  %cnt = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 false)
  ret <8 x i64> %cnt
}

; cttz on <8 x i64> with the i1 flag operand true.
; The two runs that enable avx512cd are expected to use vplzcntq on (x & -x)
; and subtract the result from a broadcast 63. The remaining runs keep the
; (x & -x) - 1 popcount sequence (vpopcntq, or vpshufb table plus vpsadbw).
define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64u:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63]
; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
; AVX512CD-NEXT:    retq
;
; AVX512CDBW-LABEL: testv8i64u:
; AVX512CDBW:       # %bb.0:
; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63]
; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
; AVX512CDBW-NEXT:    retq
;
; AVX512BW-LABEL: testv8i64u:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    retq
;
; BITALG-LABEL: testv8i64u:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT:    retq
  %cnt = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 true)
  ret <8 x i64> %cnt
}

; cttz on <16 x i32> with the i1 flag operand false.
; Same (x & -x) - 1 popcount scheme with dword arithmetic. The vpopcntdq run
; uses vpopcntd; the other runs build per-byte counts with the vpshufb table
; and reduce them to dwords via vpunpckhdq/vpunpckldq against zero, vpsadbw
; and vpackuswb.
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm5, %ymm5
; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm5
; AVX512CD-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm2, %ymm2
; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT:    retq
;
; AVX512CDBW-LABEL: testv16i32:
; AVX512CDBW:       # %bb.0:
; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512CDBW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; AVX512CDBW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
; AVX512CDBW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    retq
;
; AVX512BW-LABEL: testv16i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    retq
;
; BITALG-LABEL: testv16i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    retq
  %cnt = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 false)
  ret <16 x i32> %cnt
}

; cttz on <16 x i32> with the i1 flag operand true.
; The two runs that enable avx512cd are expected to use vplzcntd on (x & -x)
; and subtract the result from a broadcast 31; the remaining runs keep the
; popcount-based sequences.
define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32u:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
; AVX512CD-NEXT:    retq
;
; AVX512CDBW-LABEL: testv16i32u:
; AVX512CDBW:       # %bb.0:
; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
; AVX512CDBW-NEXT:    retq
;
; AVX512BW-LABEL: testv16i32u:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    retq
;
; BITALG-LABEL: testv16i32u:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
; BITALG-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; BITALG-NEXT:    retq
  %cnt = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 true)
  ret <16 x i32> %cnt
}

; cttz on <32 x i16> with the i1 flag operand false.
; Word-sized (x & -x) - 1 followed by a popcount. The bitalg run uses vpopcntw.
; The vpopcntdq run zero-extends each 256-bit half to dwords (vpmovzxwd), uses
; vpopcntd and truncates back with vpmovdw. The other runs use the vpshufb
; nibble table and fold byte pairs with vpsllw $8 / vpaddb / vpsrlw $8.
; The avx512f+cd and vpopcntdq runs work on the two ymm halves (ymm0, ymm1).
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512CD-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512CD-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm5
; AVX512CD-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsllw $8, %ymm1, %ymm2
; AVX512CD-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512CD-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512CD-NEXT:    retq
;
; AVX512CDBW-LABEL: testv32i16:
; AVX512CDBW:       # %bb.0:
; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpsllw $8, %zmm0, %zmm1
; AVX512CDBW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512CDBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512CDBW-NEXT:    retq
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i16:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512VPOPCNTDQ-NEXT:    retq
;
; BITALG-LABEL: testv32i16:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; BITALG-NEXT:    vpopcntw %zmm0, %zmm0
; BITALG-NEXT:    retq
  %cnt = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 false)
  ret <32 x i16> %cnt
}

; cttz on <32 x i16> with the i1 flag operand true.
; No run has a word-sized vplzcnt form in its expected output here, so each
; run's instruction sequence is the same popcount-based one as for the
; flag-false variant.
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16u:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512CD-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512CD-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm5
; AVX512CD-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsllw $8, %ymm1, %ymm2
; AVX512CD-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
; AVX512CD-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512CD-NEXT:    retq
;
; AVX512CDBW-LABEL: testv32i16u:
; AVX512CDBW:       # %bb.0:
; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpsllw $8, %zmm0, %zmm1
; AVX512CDBW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512CDBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512CDBW-NEXT:    retq
;
; AVX512BW-LABEL: testv32i16u:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i16u:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512VPOPCNTDQ-NEXT:    retq
;
; BITALG-LABEL: testv32i16u:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
; BITALG-NEXT:    vpopcntw %zmm0, %zmm0
; BITALG-NEXT:    retq
  %cnt = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 true)
  ret <32 x i16> %cnt
}

define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8:
; AVX512CD:       # %bb.0:
; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512CD-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT:    retq
;
; AVX512CDBW-LABEL: testv64i8:
; AVX512CDBW:       # %bb.0:
; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
; AVX512CDBW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: retq 619 ; 620 ; AVX512BW-LABEL: testv64i8: 621 ; AVX512BW: # %bb.0: 622 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 623 ; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 624 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 625 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 626 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 627 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 628 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 629 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 630 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 631 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 632 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 633 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 634 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 635 ; AVX512BW-NEXT: retq 636 ; 637 ; AVX512VPOPCNTDQ-LABEL: testv64i8: 638 ; AVX512VPOPCNTDQ: # %bb.0: 639 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 640 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 641 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 642 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 643 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 644 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 645 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 646 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 647 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 648 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 649 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 650 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 651 ; AVX512VPOPCNTDQ-NEXT: vpaddb 
%ymm5, %ymm0, %ymm0 652 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 653 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 654 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 655 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 656 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 657 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 658 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 659 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 660 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 661 ; AVX512VPOPCNTDQ-NEXT: retq 662 ; 663 ; BITALG-LABEL: testv64i8: 664 ; BITALG: # %bb.0: 665 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 666 ; BITALG-NEXT: vpsubb %zmm0, %zmm1, %zmm1 667 ; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 668 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 669 ; BITALG-NEXT: vpaddb %zmm1, %zmm0, %zmm0 670 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 671 ; BITALG-NEXT: retq 672 %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0) 673 ret <64 x i8> %out 674 } 675 676 define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { 677 ; AVX512CD-LABEL: testv64i8u: 678 ; AVX512CD: # %bb.0: 679 ; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2 680 ; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3 681 ; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 682 ; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 683 ; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 684 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 685 ; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5 686 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 687 ; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5 688 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 689 ; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0 690 ; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0 691 ; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 692 ; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2 693 ; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 
694 ; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 695 ; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2 696 ; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2 697 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 698 ; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1 699 ; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1 700 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 701 ; AVX512CD-NEXT: retq 702 ; 703 ; AVX512CDBW-LABEL: testv64i8u: 704 ; AVX512CDBW: # %bb.0: 705 ; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 706 ; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 707 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 708 ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 709 ; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 710 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 711 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 712 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 713 ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 714 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 715 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 716 ; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 717 ; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 718 ; AVX512CDBW-NEXT: retq 719 ; 720 ; AVX512BW-LABEL: testv64i8u: 721 ; AVX512BW: # %bb.0: 722 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 723 ; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 724 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 725 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 726 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 727 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 728 ; 
AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 729 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 730 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 731 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 732 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 733 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 734 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 735 ; AVX512BW-NEXT: retq 736 ; 737 ; AVX512VPOPCNTDQ-LABEL: testv64i8u: 738 ; AVX512VPOPCNTDQ: # %bb.0: 739 ; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 740 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 741 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 742 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 743 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 744 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 745 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 746 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 747 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 748 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 749 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 750 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 751 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 752 ; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 753 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 754 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 755 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 756 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 757 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 758 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 759 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 760 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 761 ; AVX512VPOPCNTDQ-NEXT: retq 762 ; 763 ; BITALG-LABEL: testv64i8u: 764 
; BITALG: # %bb.0: 765 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 766 ; BITALG-NEXT: vpsubb %zmm0, %zmm1, %zmm1 767 ; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 768 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 769 ; BITALG-NEXT: vpaddb %zmm1, %zmm0, %zmm0 770 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 771 ; BITALG-NEXT: retq 772 %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1) 773 ret <64 x i8> %out 774 } 775 776 declare <8 x i64> @llvm.cttz.v8i64(<8 x i64>, i1) 777 declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>, i1) 778 declare <32 x i16> @llvm.cttz.v32i16(<32 x i16>, i1) 779 declare <64 x i8> @llvm.cttz.v64i8(<64 x i8>, i1) 780