; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlq $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3: # BB#0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubq %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlq $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddq %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $4, %xmm1
; SSE3-NEXT: paddq %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: psadbw %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: psadbw %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %out
}

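; Annotation (not autogenerated): two expansion strategies are exercised
; above and throughout this file. Without SSSE3, ctpop lowers to the classic
; bit-parallel popcount (count bit pairs, then 2-bit fields masked with the
; 0x3333... constant visible above, then nibbles); with SSSE3 or later, each
; byte is split into nibbles and pshufb indexes the table
; [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4], the popcounts of 0-15. For i64 elements
; a final psadbw against zero sums the per-byte counts into each 64-bit lane.
; AVX1 and AVX2 produce identical 128-bit code here, so both RUN lines share
; the plain AVX prefix.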
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrld $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
; SSE3: # BB#0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubd %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrld $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddd %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $4, %xmm1
; SSE3-NEXT: paddd %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT: psadbw %xmm0, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: packuswb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: paddb %xmm4, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: psadbw %xmm0, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: psadbw %xmm0, %xmm1
; SSSE3-NEXT: packuswb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pshufb %xmm3, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm4, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE41-NEXT: psadbw %xmm0, %xmm2
; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE41-NEXT: psadbw %xmm0, %xmm1
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv4i32:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
  ret <4 x i32> %out
}

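; Annotation: psadbw only produces one sum per 64-bit half, so for i32
; elements (above) the byte counts are interleaved with zero via
; punpckldq/punpckhdq to give each i32 its own qword, each half is psadbw'd
; against zero, and packuswb recombines the two partial results.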
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16:
; SSE3: # BB#0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubw %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddw %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddw %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psllw $8, %xmm0
; SSSE3-NEXT: paddb %xmm3, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
  ret <8 x i16> %out
}

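; Annotation: for i16 elements (above) the byte counts are instead merged
; within each word: psllw $8 + paddb adds each low byte into its high byte,
; and psrlw $8 moves the combined count back into the low byte.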
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
; SSE3: # BB#0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubb %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: paddb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
; SSE41: # BB#0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pshufb %xmm3, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
  ret <16 x i8> %out
}

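; Annotation: i8 is the simplest case above, since the counts are already
; per byte and no horizontal reduction follows the bit-parallel or pshufb
; expansion. The foldv* tests below check that ctpop of constant vectors is
; evaluated at compile time and lowered to a plain constant load.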
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT: retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
  ret <2 x i64> %out
}

define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv4i32:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT: retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
  ret <4 x i32> %out
}

define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv8i16:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT: retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
  ret <8 x i16> %out
}

define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE: # BB#0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT: retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
  ret <16 x i8> %out
}

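; Annotation, as a sanity check on the folded constants above:
; ctpop(i64 256) = 1 and ctpop(i64 -1) = 64, matching the [1,64] expected
; for foldv2i64, and ctpop of <i32 256, -1, 0, 255> gives the [1,32,0,8]
; expected for foldv4i32.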
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)