; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG

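; For v2i64, targets without a native popcount either use the SWAR
; bit-twiddling expansion (SSE2/SSE3: fold the odd bits into the even bits,
; then accumulate 2-bit and 4-bit fields against 0x33.../0x0f... masks) or
; the SSSE3+ nibble lookup (pshufb on a [0,1,1,2,...] table of 4-bit
; popcounts); both finish by summing the per-byte counts into each 64-bit
; lane with psadbw. AVX512VPOPCNTDQ lowers directly to vpopcntq; BITALG only
; provides byte/word popcounts, so it still uses the lookup sequence here.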
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlq $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $4, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubq %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlq $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddq %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $4, %xmm1
; SSE3-NEXT: paddq %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: psadbw %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: psadbw %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv2i64:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv2i64:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv2i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2
; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %out
}

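; v4i32 uses the same byte-count strategies, but psadbw only produces one sum
; per 64-bit lane, so the byte counts are unpacked/zero-extended against zero
; into low and high dword pairs, summed separately, and recombined with
; packuswb.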
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrld $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: psadbw %xmm0, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: psadbw %xmm0, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubd %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrld $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddd %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $4, %xmm1
; SSE3-NEXT: paddd %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm0
; SSE3-NEXT: movdqa %xmm1, %xmm2
; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT: psadbw %xmm0, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: psadbw %xmm0, %xmm1
; SSE3-NEXT: packuswb %xmm2, %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: paddb %xmm4, %xmm1
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: psadbw %xmm0, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: psadbw %xmm0, %xmm1
; SSSE3-NEXT: packuswb %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE41-NEXT: psadbw %xmm1, %xmm3
; SSE41-NEXT: psadbw %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv4i32:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv4i32:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv4i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2
; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; BITALG-NEXT: retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
  ret <4 x i32> %out
}

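; v8i16 merges adjacent byte counts without psadbw: psllw $8 moves each low
; byte's count onto its high byte, paddb sums the pair, and psrlw $8 brings
; the per-word total back to the low byte. BITALG lowers directly to
; vpopcntw; AVX512VPOPCNTDQ zero-extends to dwords for vpopcntd instead.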
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: psrlw $8, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubw %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddw %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddw %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: paddb %xmm1, %xmm0
; SSE3-NEXT: psrlw $8, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm3, %xmm4
; SSSE3-NEXT: pshufb %xmm2, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: paddb %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psllw $8, %xmm0
; SSSE3-NEXT: paddb %xmm3, %xmm0
; SSSE3-NEXT: psrlw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pshufb %xmm2, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm3
; SSE41-NEXT: paddb %xmm4, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv8i16:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv8i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntw %xmm0, %xmm0
; BITALG-NEXT: retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
  ret <8 x i16> %out
}

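; v16i8 needs no horizontal reduction: the per-byte counts are already the
; result. AVX512VPOPCNTDQ zero-extends the bytes to dwords for vpopcntd and
; truncates back with vpmovdb; BITALG lowers directly to vpopcntb.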
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: paddb %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: psubb %xmm1, %xmm0
; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT: movdqa %xmm0, %xmm2
; SSE3-NEXT: pand %xmm1, %xmm2
; SSE3-NEXT: psrlw $2, %xmm0
; SSE3-NEXT: pand %xmm1, %xmm0
; SSE3-NEXT: paddb %xmm2, %xmm0
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $4, %xmm1
; SSE3-NEXT: paddb %xmm0, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSE3-NEXT: movdqa %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: paddb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: pshufb %xmm3, %xmm4
; SSE41-NEXT: psrlw $4, %xmm0
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: paddb %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8:
; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
; AVX512VPOPCNTDQVL-LABEL: testv16i8:
; AVX512VPOPCNTDQVL: # %bb.0:
; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT: vzeroupper
; AVX512VPOPCNTDQVL-NEXT: retq
;
; BITALG_NOVLX-LABEL: testv16i8:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT: vzeroupper
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: testv16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vpopcntb %xmm0, %xmm0
; BITALG-NEXT: retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
  ret <16 x i8> %out
}

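; The fold* tests check that ctpop of constant vectors is folded away to a
; plain constant-pool load on every configuration.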
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv2i64:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv2i64:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; BITALG-NEXT: retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
  ret <2 x i64> %out
}

define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv4i32:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv4i32:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv4i32:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; BITALG-NEXT: retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
  ret <4 x i32> %out
}

define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv8i16:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv8i16:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; BITALG-NEXT: retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
  ret <8 x i16> %out
}

define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT: retq
;
; BITALG_NOVLX-LABEL: foldv16i8:
; BITALG_NOVLX: # %bb.0:
; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; BITALG_NOVLX-NEXT: retq
;
; BITALG-LABEL: foldv16i8:
; BITALG: # %bb.0:
; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; BITALG-NEXT: retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
  ret <16 x i8> %out
}

declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)