; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG

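; 128-bit vector popcount (llvm.ctpop) lowering across SSE/AVX/AVX512 levels.
;
; testv2i64: none of the pre-AVX512 subtargets has a vector popcount
; instruction, so SSE2/SSE3 fall back on the parallel bit-count from
; Hacker's Delight, applied per 64-bit lane; roughly the scalar equivalent:
;
;   v = v - ((v >> 1) & 0x5555555555555555);             // 2-bit partial sums
;   v = (v & 0x3333333333333333) + ((v >> 2) & 0x3333333333333333);
;   v = (v + (v >> 4)) & 0x0f0f0f0f0f0f0f0f;             // per-byte counts
;
; with psadbw against zero summing the eight byte counts into each i64 lane.
; SSSE3 and later instead use pshufb as a 16-entry lookup table: the constant
; [0,1,1,2,1,2,2,3,...,4] holds the popcount of each nibble value 0-15 and is
; applied to the low and high nibbles of every byte, then combined with paddb.
; AVX512VPOPCNTDQ uses vpopcntq directly, widening to zmm when AVX512VL is
; unavailable.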
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlq $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $4, %xmm1
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlq $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubq %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlq $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddq %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlq $4, %xmm1
; SSE3-NEXT:    paddq %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    psadbw %xmm3, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    psadbw %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv2i64:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv2i64:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; BITALG_NOVLX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv2i64:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm2
; BITALG-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; BITALG-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %out
}

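; testv4i32: the same byte-count strategies as testv2i64, but psadbw only
; produces per-qword sums, so the byte counts are first zero-extended from
; i32 to i64 lanes (punpckldq/punpckhdq against zero, or pmovzxdq where
; available), reduced with two psadbw ops, and packed back to i32 lanes
; with packuswb.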
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrld $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    psadbw %xmm0, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrld $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubd %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrld $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrld $4, %xmm1
; SSE3-NEXT:    paddd %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    psadbw %xmm0, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    packuswb %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT:    psadbw %xmm0, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    psadbw %xmm0, %xmm1
; SSSE3-NEXT:    packuswb %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE41-NEXT:    psadbw %xmm1, %xmm3
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv4i32:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv4i32:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; BITALG_NOVLX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv4i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm2
; BITALG-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; BITALG-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; BITALG-NEXT:    retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
  ret <4 x i32> %out
}

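; testv8i16: the byte counts are reduced within each 16-bit lane by shifting
; the low byte's count up with psllw $8, adding the pair with paddb, and
; shifting the sum back down with psrlw $8. AVX512VPOPCNTDQ has no 16-bit
; popcount, so it widens to 32-bit lanes, uses vpopcntd, and narrows again
; with vpmovdw; only BITALG provides a native vpopcntw.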
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubw %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddw %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddw %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    psllw $8, %xmm0
; SSE3-NEXT:    paddb %xmm1, %xmm0
; SSE3-NEXT:    psrlw $8, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    psllw $8, %xmm0
; SSSE3-NEXT:    paddb %xmm3, %xmm0
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    paddb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv8i16:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv8i16:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv8i16:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
; BITALG-NEXT:    retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
  ret <8 x i16> %out
}

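; testv16i8: the per-byte counts are already the final result, so no
; horizontal reduction follows the SWAR or pshufb sequence. Lacking BITALG's
; vpopcntb, the VPOPCNTDQ targets zero-extend the bytes to 32-bit lanes in a
; zmm register, count with vpopcntd, and truncate back with vpmovdb.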
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubb %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddb %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    pshufb %xmm3, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm4, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv16i8:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv16i8:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv16i8:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
; BITALG-NEXT:    retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
  ret <16 x i8> %out
}

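; The foldv* tests call ctpop on constant operands; the counts should be
; folded at compile time, leaving each configuration a single load of the
; precomputed vector (e.g. ctpop(<i64 256, i64 -1>) = <1, 64>).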
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv2i64:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv2i64:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
; BITALG-NEXT:    retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
  ret <2 x i64> %out
}

define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv4i32:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv4i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; BITALG-NEXT:    retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
  ret <4 x i32> %out
}

define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv8i16:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv8i16:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; BITALG-NEXT:    retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
  ret <8 x i16> %out
}

define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv16i8:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv16i8:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; BITALG-NEXT:    retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
  ret <16 x i8> %out
}

declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)