; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD

define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i64:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX512CD-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i64u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0
; AVX512CDVL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i32:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpandd %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX512CD-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i32u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpandd %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0
; AVX512CDVL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv16i16:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv16i16:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT: retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv16i16u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv16i16u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT: retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv32i8:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv32i8:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv32i8u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv32i8u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

define <4 x i64> @foldv4i64() nounwind {
; AVX1-LABEL: foldv4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv4i64:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv4i64:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CD-NEXT: retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; AVX1-LABEL: foldv4i64u:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv4i64u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv4i64u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv4i64u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CD-NEXT: retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; AVX1-LABEL: foldv8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv8i32:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv8i32:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CD-NEXT: retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; AVX1-LABEL: foldv8i32u:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv8i32u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv8i32u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv8i32u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CD-NEXT: retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; AVX1-LABEL: foldv16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv16i16:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv16i16:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT: retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; AVX1-LABEL: foldv16i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv16i16u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv16i16u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv16i16u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT: retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; AVX1-LABEL: foldv32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv32i8:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv32i8:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CD-NEXT: retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; AVX1-LABEL: foldv32i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: foldv32i8u:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: foldv32i8u:
; AVX512CDVL: # BB#0:
; AVX512CDVL-NEXT: vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: foldv32i8u:
; AVX512CD: # BB#0:
; AVX512CD-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CD-NEXT: retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)