; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

define <4 x i64> @foldv4i64() nounwind {
; ALL-LABEL: foldv4i64:
; ALL: # BB#0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; ALL-NEXT: retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; ALL-LABEL: foldv4i64u:
; ALL: # BB#0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; ALL-NEXT: retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; ALL-LABEL: foldv8i32:
; ALL: # BB#0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; ALL-NEXT: retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; ALL-LABEL: foldv8i32u:
; ALL: # BB#0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; ALL-NEXT: retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; ALL-LABEL: foldv16i16:
; ALL: # BB#0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; ALL-NEXT: retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; ALL-LABEL: foldv16i16u:
; ALL: # BB#0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; ALL-NEXT: retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; ALL-LABEL: foldv32i8:
; ALL: # BB#0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; ALL-NEXT: retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; ALL-LABEL: foldv32i8u:
; ALL: # BB#0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; ALL-NEXT: retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)