; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

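; Each test checks the lowering of a vector ctpop intrinsic. Without SSSE3,
; the backend emits the classic bit-twiddling popcount (the 0x55/0x33/0x0f
; mask sequence from Hacker's Delight), then sums the per-byte counts across
; each 64-bit lane with PSADBW against zero. With SSSE3 or later, PSHUFB
; indexes the per-nibble lookup table [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] once
; for the low and once for the high nibble of each byte. A worked scalar
; sketch of the bit-twiddling steps, for one byte x = 0x6d (popcount 5):
;   a = x - ((x >> 1) & 0x55)            ; 0x59: counts of 2-bit fields
;   b = (a & 0x33) + ((a >> 2) & 0x33)   ; 0x23: counts of 4-bit fields
;   c = (b + (b >> 4)) & 0x0f            ; 0x05: count for the whole byte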
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlq $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $4, %xmm1
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv2i64:
; SSE3:       # BB#0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlq $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubq %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlq $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddq %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlq $4, %xmm1
; SSE3-NEXT:    paddq %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    psadbw %xmm3, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    psadbw %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %out
}

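; For v4i32 the byte counts must be reduced per dword rather than per qword.
; PSADBW only produces one sum per 64-bit half, so the counts are first
; interleaved with zero (punpckhdq/punpckldq) so that each dword's four bytes
; occupy their own 64-bit lane, summed with two PSADBWs, and the two results
; repacked with PACKUSWB.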
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrld $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    psadbw %xmm0, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv4i32:
; SSE3:       # BB#0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrld $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubd %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrld $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrld $4, %xmm1
; SSE3-NEXT:    paddd %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    psadbw %xmm0, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    packuswb %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT:    psadbw %xmm0, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    psadbw %xmm0, %xmm1
; SSSE3-NEXT:    packuswb %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    pshufb %xmm3, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm4, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE41-NEXT:    psadbw %xmm0, %xmm2
; SSE41-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE41-NEXT:    psadbw %xmm0, %xmm1
; SSE41-NEXT:    packuswb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
  ret <4 x i32> %out
}

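; For v8i16 the per-byte counts are reduced without PSADBW: psllw $8 + paddb
; adds each word's low-byte count into its high byte, and psrlw $8 moves the
; per-word total down to the low byte. E.g. byte counts [3,5] within a word
; become the word value 8.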
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv8i16:
; SSE3:       # BB#0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubw %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddw %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddw %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    psllw $8, %xmm0
; SSE3-NEXT:    paddb %xmm1, %xmm0
; SSE3-NEXT:    psrlw $8, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    psllw $8, %xmm0
; SSSE3-NEXT:    paddb %xmm3, %xmm0
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    paddb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
  ret <8 x i16> %out
}

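; v16i8 is the base case: the per-byte counts are already the final result,
; so no PSADBW or shift/add reduction step follows.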
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv16i8:
; SSE3:       # BB#0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubb %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddb %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    pshufb %xmm3, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm4, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: testv16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
  ret <16 x i8> %out
}

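; The foldv* tests check that ctpop of constant vectors is folded at compile
; time to a single constant-pool load. E.g. for foldv2i64: ctpop(i64 256) = 1
; and ctpop(i64 -1) = 64, giving the expected [1,64].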
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT:    retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
  ret <2 x i64> %out
}

define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT:    retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
  ret <4 x i32> %out
}

define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT:    retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
  ret <8 x i16> %out
}

define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT:    retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
  ret <16 x i8> %out
}

declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)