      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
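; Summary comment (added for readability; describes only what the checks below already show):
; these tests exercise lowering of the llvm.bitreverse.* intrinsics for scalar
; (i8/i16/i32/i64) and vector operands across SSE2, SSSE3, AVX/AVX2, AVX-512 and
; XOP targets. Without XOP, scalar reversal expands into per-bit shift/and/or
; sequences; SSSE3/AVX vector code reverses each byte with nibble lookup tables
; via pshufb; XOP targets collapse the whole operation into a single vpperm.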
     10 
     11 define i8 @test_bitreverse_i8(i8 %a) nounwind {
     12 ; SSE-LABEL: test_bitreverse_i8:
     13 ; SSE:       # BB#0:
     14 ; SSE-NEXT:    movl %edi, %eax
     15 ; SSE-NEXT:    shlb $7, %al
     16 ; SSE-NEXT:    movl %edi, %ecx
     17 ; SSE-NEXT:    shlb $5, %cl
     18 ; SSE-NEXT:    andb $64, %cl
     19 ; SSE-NEXT:    movl %edi, %edx
     20 ; SSE-NEXT:    shlb $3, %dl
     21 ; SSE-NEXT:    andb $32, %dl
     22 ; SSE-NEXT:    orb %cl, %dl
     23 ; SSE-NEXT:    movl %edi, %ecx
     24 ; SSE-NEXT:    addb %cl, %cl
     25 ; SSE-NEXT:    andb $16, %cl
     26 ; SSE-NEXT:    orb %dl, %cl
     27 ; SSE-NEXT:    movl %edi, %edx
     28 ; SSE-NEXT:    shrb %dl
     29 ; SSE-NEXT:    andb $8, %dl
     30 ; SSE-NEXT:    orb %cl, %dl
     31 ; SSE-NEXT:    movl %edi, %ecx
     32 ; SSE-NEXT:    shrb $3, %cl
     33 ; SSE-NEXT:    andb $4, %cl
     34 ; SSE-NEXT:    orb %dl, %cl
     35 ; SSE-NEXT:    movl %edi, %edx
     36 ; SSE-NEXT:    shrb $5, %dl
     37 ; SSE-NEXT:    andb $2, %dl
     38 ; SSE-NEXT:    orb %cl, %dl
     39 ; SSE-NEXT:    shrb $7, %dil
     40 ; SSE-NEXT:    orb %dl, %dil
     41 ; SSE-NEXT:    orb %al, %dil
     42 ; SSE-NEXT:    movl %edi, %eax
     43 ; SSE-NEXT:    retq
     44 ;
     45 ; AVX-LABEL: test_bitreverse_i8:
     46 ; AVX:       # BB#0:
     47 ; AVX-NEXT:    movl %edi, %eax
     48 ; AVX-NEXT:    shlb $7, %al
     49 ; AVX-NEXT:    movl %edi, %ecx
     50 ; AVX-NEXT:    shlb $5, %cl
     51 ; AVX-NEXT:    andb $64, %cl
     52 ; AVX-NEXT:    movl %edi, %edx
     53 ; AVX-NEXT:    shlb $3, %dl
     54 ; AVX-NEXT:    andb $32, %dl
     55 ; AVX-NEXT:    orb %cl, %dl
     56 ; AVX-NEXT:    movl %edi, %ecx
     57 ; AVX-NEXT:    addb %cl, %cl
     58 ; AVX-NEXT:    andb $16, %cl
     59 ; AVX-NEXT:    orb %dl, %cl
     60 ; AVX-NEXT:    movl %edi, %edx
     61 ; AVX-NEXT:    shrb %dl
     62 ; AVX-NEXT:    andb $8, %dl
     63 ; AVX-NEXT:    orb %cl, %dl
     64 ; AVX-NEXT:    movl %edi, %ecx
     65 ; AVX-NEXT:    shrb $3, %cl
     66 ; AVX-NEXT:    andb $4, %cl
     67 ; AVX-NEXT:    orb %dl, %cl
     68 ; AVX-NEXT:    movl %edi, %edx
     69 ; AVX-NEXT:    shrb $5, %dl
     70 ; AVX-NEXT:    andb $2, %dl
     71 ; AVX-NEXT:    orb %cl, %dl
     72 ; AVX-NEXT:    shrb $7, %dil
     73 ; AVX-NEXT:    orb %dl, %dil
     74 ; AVX-NEXT:    orb %al, %dil
     75 ; AVX-NEXT:    movl %edi, %eax
     76 ; AVX-NEXT:    retq
     77 ;
     78 ; XOP-LABEL: test_bitreverse_i8:
     79 ; XOP:       # BB#0:
     80 ; XOP-NEXT:    vmovd %edi, %xmm0
     81 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
     82 ; XOP-NEXT:    vpextrb $0, %xmm0, %eax
     83 ; XOP-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
     84 ; XOP-NEXT:    retq
     85   %b = call i8 @llvm.bitreverse.i8(i8 %a)
     86   ret i8 %b
     87 }
     88 
     89 define i16 @test_bitreverse_i16(i16 %a) nounwind {
     90 ; SSE-LABEL: test_bitreverse_i16:
     91 ; SSE:       # BB#0:
     92 ; SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
     93 ; SSE-NEXT:    movl %edi, %ecx
     94 ; SSE-NEXT:    andl $32768, %ecx # imm = 0x8000
     95 ; SSE-NEXT:    movl %edi, %eax
     96 ; SSE-NEXT:    shll $15, %eax
     97 ; SSE-NEXT:    movl %edi, %edx
     98 ; SSE-NEXT:    andl $2, %edx
     99 ; SSE-NEXT:    shll $13, %edx
    100 ; SSE-NEXT:    leal (%rdx,%rax), %eax
    101 ; SSE-NEXT:    movl %edi, %edx
    102 ; SSE-NEXT:    andl $4, %edx
    103 ; SSE-NEXT:    shll $11, %edx
    104 ; SSE-NEXT:    orl %edx, %eax
    105 ; SSE-NEXT:    movl %edi, %edx
    106 ; SSE-NEXT:    andl $8, %edx
    107 ; SSE-NEXT:    shll $9, %edx
    108 ; SSE-NEXT:    orl %edx, %eax
    109 ; SSE-NEXT:    movl %edi, %edx
    110 ; SSE-NEXT:    andl $16, %edx
    111 ; SSE-NEXT:    shll $7, %edx
    112 ; SSE-NEXT:    orl %edx, %eax
    113 ; SSE-NEXT:    movl %edi, %edx
    114 ; SSE-NEXT:    andl $32, %edx
    115 ; SSE-NEXT:    shll $5, %edx
    116 ; SSE-NEXT:    orl %edx, %eax
    117 ; SSE-NEXT:    movl %edi, %edx
    118 ; SSE-NEXT:    andl $64, %edx
    119 ; SSE-NEXT:    shll $3, %edx
    120 ; SSE-NEXT:    leal (%rdi,%rdi), %esi
    121 ; SSE-NEXT:    andl $256, %esi # imm = 0x100
    122 ; SSE-NEXT:    orl %edx, %esi
    123 ; SSE-NEXT:    movl %edi, %edx
    124 ; SSE-NEXT:    shrl %edx
    125 ; SSE-NEXT:    andl $128, %edx
    126 ; SSE-NEXT:    orl %esi, %edx
    127 ; SSE-NEXT:    movl %edi, %esi
    128 ; SSE-NEXT:    shrl $3, %esi
    129 ; SSE-NEXT:    andl $64, %esi
    130 ; SSE-NEXT:    orl %edx, %esi
    131 ; SSE-NEXT:    movl %edi, %edx
    132 ; SSE-NEXT:    shrl $5, %edx
    133 ; SSE-NEXT:    andl $32, %edx
    134 ; SSE-NEXT:    orl %esi, %edx
    135 ; SSE-NEXT:    movl %edi, %esi
    136 ; SSE-NEXT:    shrl $7, %esi
    137 ; SSE-NEXT:    andl $16, %esi
    138 ; SSE-NEXT:    orl %edx, %esi
    139 ; SSE-NEXT:    movl %edi, %edx
    140 ; SSE-NEXT:    shrl $9, %edx
    141 ; SSE-NEXT:    andl $8, %edx
    142 ; SSE-NEXT:    orl %esi, %edx
    143 ; SSE-NEXT:    movl %edi, %esi
    144 ; SSE-NEXT:    shrl $11, %esi
    145 ; SSE-NEXT:    andl $4, %esi
    146 ; SSE-NEXT:    orl %edx, %esi
    147 ; SSE-NEXT:    shrl $13, %edi
    148 ; SSE-NEXT:    andl $2, %edi
    149 ; SSE-NEXT:    orl %esi, %edi
    150 ; SSE-NEXT:    shrl $15, %ecx
    151 ; SSE-NEXT:    orl %edi, %ecx
    152 ; SSE-NEXT:    orl %ecx, %eax
    153 ; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
    154 ; SSE-NEXT:    retq
    155 ;
    156 ; AVX-LABEL: test_bitreverse_i16:
    157 ; AVX:       # BB#0:
    158 ; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
    159 ; AVX-NEXT:    movl %edi, %ecx
    160 ; AVX-NEXT:    andl $32768, %ecx # imm = 0x8000
    161 ; AVX-NEXT:    movl %edi, %eax
    162 ; AVX-NEXT:    shll $15, %eax
    163 ; AVX-NEXT:    movl %edi, %edx
    164 ; AVX-NEXT:    andl $2, %edx
    165 ; AVX-NEXT:    shll $13, %edx
    166 ; AVX-NEXT:    leal (%rdx,%rax), %eax
    167 ; AVX-NEXT:    movl %edi, %edx
    168 ; AVX-NEXT:    andl $4, %edx
    169 ; AVX-NEXT:    shll $11, %edx
    170 ; AVX-NEXT:    orl %edx, %eax
    171 ; AVX-NEXT:    movl %edi, %edx
    172 ; AVX-NEXT:    andl $8, %edx
    173 ; AVX-NEXT:    shll $9, %edx
    174 ; AVX-NEXT:    orl %edx, %eax
    175 ; AVX-NEXT:    movl %edi, %edx
    176 ; AVX-NEXT:    andl $16, %edx
    177 ; AVX-NEXT:    shll $7, %edx
    178 ; AVX-NEXT:    orl %edx, %eax
    179 ; AVX-NEXT:    movl %edi, %edx
    180 ; AVX-NEXT:    andl $32, %edx
    181 ; AVX-NEXT:    shll $5, %edx
    182 ; AVX-NEXT:    orl %edx, %eax
    183 ; AVX-NEXT:    movl %edi, %edx
    184 ; AVX-NEXT:    andl $64, %edx
    185 ; AVX-NEXT:    shll $3, %edx
    186 ; AVX-NEXT:    leal (%rdi,%rdi), %esi
    187 ; AVX-NEXT:    andl $256, %esi # imm = 0x100
    188 ; AVX-NEXT:    orl %edx, %esi
    189 ; AVX-NEXT:    movl %edi, %edx
    190 ; AVX-NEXT:    shrl %edx
    191 ; AVX-NEXT:    andl $128, %edx
    192 ; AVX-NEXT:    orl %esi, %edx
    193 ; AVX-NEXT:    movl %edi, %esi
    194 ; AVX-NEXT:    shrl $3, %esi
    195 ; AVX-NEXT:    andl $64, %esi
    196 ; AVX-NEXT:    orl %edx, %esi
    197 ; AVX-NEXT:    movl %edi, %edx
    198 ; AVX-NEXT:    shrl $5, %edx
    199 ; AVX-NEXT:    andl $32, %edx
    200 ; AVX-NEXT:    orl %esi, %edx
    201 ; AVX-NEXT:    movl %edi, %esi
    202 ; AVX-NEXT:    shrl $7, %esi
    203 ; AVX-NEXT:    andl $16, %esi
    204 ; AVX-NEXT:    orl %edx, %esi
    205 ; AVX-NEXT:    movl %edi, %edx
    206 ; AVX-NEXT:    shrl $9, %edx
    207 ; AVX-NEXT:    andl $8, %edx
    208 ; AVX-NEXT:    orl %esi, %edx
    209 ; AVX-NEXT:    movl %edi, %esi
    210 ; AVX-NEXT:    shrl $11, %esi
    211 ; AVX-NEXT:    andl $4, %esi
    212 ; AVX-NEXT:    orl %edx, %esi
    213 ; AVX-NEXT:    shrl $13, %edi
    214 ; AVX-NEXT:    andl $2, %edi
    215 ; AVX-NEXT:    orl %esi, %edi
    216 ; AVX-NEXT:    shrl $15, %ecx
    217 ; AVX-NEXT:    orl %edi, %ecx
    218 ; AVX-NEXT:    orl %ecx, %eax
    219 ; AVX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
    220 ; AVX-NEXT:    retq
    221 ;
    222 ; XOP-LABEL: test_bitreverse_i16:
    223 ; XOP:       # BB#0:
    224 ; XOP-NEXT:    vmovd %edi, %xmm0
    225 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    226 ; XOP-NEXT:    vmovd %xmm0, %eax
    227 ; XOP-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
    228 ; XOP-NEXT:    retq
    229   %b = call i16 @llvm.bitreverse.i16(i16 %a)
    230   ret i16 %b
    231 }
    232 
    233 define i32 @test_bitreverse_i32(i32 %a) nounwind {
    234 ; SSE-LABEL: test_bitreverse_i32:
    235 ; SSE:       # BB#0:
    236 ; SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
    237 ; SSE-NEXT:    movl %edi, %eax
    238 ; SSE-NEXT:    shll $31, %eax
    239 ; SSE-NEXT:    movl %edi, %ecx
    240 ; SSE-NEXT:    andl $2, %ecx
    241 ; SSE-NEXT:    shll $29, %ecx
    242 ; SSE-NEXT:    leal (%rcx,%rax), %eax
    243 ; SSE-NEXT:    movl %edi, %ecx
    244 ; SSE-NEXT:    andl $4, %ecx
    245 ; SSE-NEXT:    shll $27, %ecx
    246 ; SSE-NEXT:    orl %ecx, %eax
    247 ; SSE-NEXT:    movl %edi, %ecx
    248 ; SSE-NEXT:    andl $8, %ecx
    249 ; SSE-NEXT:    shll $25, %ecx
    250 ; SSE-NEXT:    orl %ecx, %eax
    251 ; SSE-NEXT:    movl %edi, %ecx
    252 ; SSE-NEXT:    andl $16, %ecx
    253 ; SSE-NEXT:    shll $23, %ecx
    254 ; SSE-NEXT:    orl %ecx, %eax
    255 ; SSE-NEXT:    movl %edi, %ecx
    256 ; SSE-NEXT:    andl $32, %ecx
    257 ; SSE-NEXT:    shll $21, %ecx
    258 ; SSE-NEXT:    orl %ecx, %eax
    259 ; SSE-NEXT:    movl %edi, %ecx
    260 ; SSE-NEXT:    andl $64, %ecx
    261 ; SSE-NEXT:    shll $19, %ecx
    262 ; SSE-NEXT:    movl %edi, %edx
    263 ; SSE-NEXT:    shll $17, %edx
    264 ; SSE-NEXT:    andl $16777216, %edx # imm = 0x1000000
    265 ; SSE-NEXT:    orl %ecx, %edx
    266 ; SSE-NEXT:    movl %edi, %ecx
    267 ; SSE-NEXT:    shll $15, %ecx
    268 ; SSE-NEXT:    andl $8388608, %ecx # imm = 0x800000
    269 ; SSE-NEXT:    orl %edx, %ecx
    270 ; SSE-NEXT:    movl %edi, %edx
    271 ; SSE-NEXT:    shll $13, %edx
    272 ; SSE-NEXT:    andl $4194304, %edx # imm = 0x400000
    273 ; SSE-NEXT:    orl %ecx, %edx
    274 ; SSE-NEXT:    movl %edi, %ecx
    275 ; SSE-NEXT:    shll $11, %ecx
    276 ; SSE-NEXT:    andl $2097152, %ecx # imm = 0x200000
    277 ; SSE-NEXT:    orl %edx, %ecx
    278 ; SSE-NEXT:    movl %edi, %edx
    279 ; SSE-NEXT:    shll $9, %edx
    280 ; SSE-NEXT:    andl $1048576, %edx # imm = 0x100000
    281 ; SSE-NEXT:    orl %ecx, %edx
    282 ; SSE-NEXT:    movl %edi, %ecx
    283 ; SSE-NEXT:    shll $7, %ecx
    284 ; SSE-NEXT:    andl $524288, %ecx # imm = 0x80000
    285 ; SSE-NEXT:    orl %edx, %ecx
    286 ; SSE-NEXT:    movl %edi, %edx
    287 ; SSE-NEXT:    shll $5, %edx
    288 ; SSE-NEXT:    andl $262144, %edx # imm = 0x40000
    289 ; SSE-NEXT:    orl %ecx, %edx
    290 ; SSE-NEXT:    leal (,%rdi,8), %ecx
    291 ; SSE-NEXT:    andl $131072, %ecx # imm = 0x20000
    292 ; SSE-NEXT:    orl %edx, %ecx
    293 ; SSE-NEXT:    leal (%rdi,%rdi), %edx
    294 ; SSE-NEXT:    andl $65536, %edx # imm = 0x10000
    295 ; SSE-NEXT:    orl %ecx, %edx
    296 ; SSE-NEXT:    movl %edi, %ecx
    297 ; SSE-NEXT:    shrl %ecx
    298 ; SSE-NEXT:    andl $32768, %ecx # imm = 0x8000
    299 ; SSE-NEXT:    orl %edx, %ecx
    300 ; SSE-NEXT:    movl %edi, %edx
    301 ; SSE-NEXT:    shrl $3, %edx
    302 ; SSE-NEXT:    andl $16384, %edx # imm = 0x4000
    303 ; SSE-NEXT:    orl %ecx, %edx
    304 ; SSE-NEXT:    movl %edi, %ecx
    305 ; SSE-NEXT:    shrl $5, %ecx
    306 ; SSE-NEXT:    andl $8192, %ecx # imm = 0x2000
    307 ; SSE-NEXT:    orl %edx, %ecx
    308 ; SSE-NEXT:    movl %edi, %edx
    309 ; SSE-NEXT:    shrl $7, %edx
    310 ; SSE-NEXT:    andl $4096, %edx # imm = 0x1000
    311 ; SSE-NEXT:    orl %ecx, %edx
    312 ; SSE-NEXT:    movl %edi, %ecx
    313 ; SSE-NEXT:    shrl $9, %ecx
    314 ; SSE-NEXT:    andl $2048, %ecx # imm = 0x800
    315 ; SSE-NEXT:    orl %edx, %ecx
    316 ; SSE-NEXT:    movl %edi, %edx
    317 ; SSE-NEXT:    shrl $11, %edx
    318 ; SSE-NEXT:    andl $1024, %edx # imm = 0x400
    319 ; SSE-NEXT:    orl %ecx, %edx
    320 ; SSE-NEXT:    movl %edi, %ecx
    321 ; SSE-NEXT:    shrl $13, %ecx
    322 ; SSE-NEXT:    andl $512, %ecx # imm = 0x200
    323 ; SSE-NEXT:    orl %edx, %ecx
    324 ; SSE-NEXT:    movl %edi, %edx
    325 ; SSE-NEXT:    shrl $15, %edx
    326 ; SSE-NEXT:    andl $256, %edx # imm = 0x100
    327 ; SSE-NEXT:    orl %ecx, %edx
    328 ; SSE-NEXT:    movl %edi, %ecx
    329 ; SSE-NEXT:    shrl $17, %ecx
    330 ; SSE-NEXT:    andl $128, %ecx
    331 ; SSE-NEXT:    orl %edx, %ecx
    332 ; SSE-NEXT:    movl %edi, %edx
    333 ; SSE-NEXT:    shrl $19, %edx
    334 ; SSE-NEXT:    andl $64, %edx
    335 ; SSE-NEXT:    orl %ecx, %edx
    336 ; SSE-NEXT:    movl %edi, %ecx
    337 ; SSE-NEXT:    shrl $21, %ecx
    338 ; SSE-NEXT:    andl $32, %ecx
    339 ; SSE-NEXT:    orl %edx, %ecx
    340 ; SSE-NEXT:    movl %edi, %edx
    341 ; SSE-NEXT:    shrl $23, %edx
    342 ; SSE-NEXT:    andl $16, %edx
    343 ; SSE-NEXT:    orl %ecx, %edx
    344 ; SSE-NEXT:    movl %edi, %ecx
    345 ; SSE-NEXT:    shrl $25, %ecx
    346 ; SSE-NEXT:    andl $8, %ecx
    347 ; SSE-NEXT:    orl %edx, %ecx
    348 ; SSE-NEXT:    movl %edi, %edx
    349 ; SSE-NEXT:    shrl $27, %edx
    350 ; SSE-NEXT:    andl $4, %edx
    351 ; SSE-NEXT:    orl %ecx, %edx
    352 ; SSE-NEXT:    movl %edi, %ecx
    353 ; SSE-NEXT:    shrl $29, %ecx
    354 ; SSE-NEXT:    andl $2, %ecx
    355 ; SSE-NEXT:    orl %edx, %ecx
    356 ; SSE-NEXT:    shrl $31, %edi
    357 ; SSE-NEXT:    orl %ecx, %edi
    358 ; SSE-NEXT:    orl %edi, %eax
    359 ; SSE-NEXT:    retq
    360 ;
    361 ; AVX-LABEL: test_bitreverse_i32:
    362 ; AVX:       # BB#0:
    363 ; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
    364 ; AVX-NEXT:    movl %edi, %eax
    365 ; AVX-NEXT:    shll $31, %eax
    366 ; AVX-NEXT:    movl %edi, %ecx
    367 ; AVX-NEXT:    andl $2, %ecx
    368 ; AVX-NEXT:    shll $29, %ecx
    369 ; AVX-NEXT:    leal (%rcx,%rax), %eax
    370 ; AVX-NEXT:    movl %edi, %ecx
    371 ; AVX-NEXT:    andl $4, %ecx
    372 ; AVX-NEXT:    shll $27, %ecx
    373 ; AVX-NEXT:    orl %ecx, %eax
    374 ; AVX-NEXT:    movl %edi, %ecx
    375 ; AVX-NEXT:    andl $8, %ecx
    376 ; AVX-NEXT:    shll $25, %ecx
    377 ; AVX-NEXT:    orl %ecx, %eax
    378 ; AVX-NEXT:    movl %edi, %ecx
    379 ; AVX-NEXT:    andl $16, %ecx
    380 ; AVX-NEXT:    shll $23, %ecx
    381 ; AVX-NEXT:    orl %ecx, %eax
    382 ; AVX-NEXT:    movl %edi, %ecx
    383 ; AVX-NEXT:    andl $32, %ecx
    384 ; AVX-NEXT:    shll $21, %ecx
    385 ; AVX-NEXT:    orl %ecx, %eax
    386 ; AVX-NEXT:    movl %edi, %ecx
    387 ; AVX-NEXT:    andl $64, %ecx
    388 ; AVX-NEXT:    shll $19, %ecx
    389 ; AVX-NEXT:    movl %edi, %edx
    390 ; AVX-NEXT:    shll $17, %edx
    391 ; AVX-NEXT:    andl $16777216, %edx # imm = 0x1000000
    392 ; AVX-NEXT:    orl %ecx, %edx
    393 ; AVX-NEXT:    movl %edi, %ecx
    394 ; AVX-NEXT:    shll $15, %ecx
    395 ; AVX-NEXT:    andl $8388608, %ecx # imm = 0x800000
    396 ; AVX-NEXT:    orl %edx, %ecx
    397 ; AVX-NEXT:    movl %edi, %edx
    398 ; AVX-NEXT:    shll $13, %edx
    399 ; AVX-NEXT:    andl $4194304, %edx # imm = 0x400000
    400 ; AVX-NEXT:    orl %ecx, %edx
    401 ; AVX-NEXT:    movl %edi, %ecx
    402 ; AVX-NEXT:    shll $11, %ecx
    403 ; AVX-NEXT:    andl $2097152, %ecx # imm = 0x200000
    404 ; AVX-NEXT:    orl %edx, %ecx
    405 ; AVX-NEXT:    movl %edi, %edx
    406 ; AVX-NEXT:    shll $9, %edx
    407 ; AVX-NEXT:    andl $1048576, %edx # imm = 0x100000
    408 ; AVX-NEXT:    orl %ecx, %edx
    409 ; AVX-NEXT:    movl %edi, %ecx
    410 ; AVX-NEXT:    shll $7, %ecx
    411 ; AVX-NEXT:    andl $524288, %ecx # imm = 0x80000
    412 ; AVX-NEXT:    orl %edx, %ecx
    413 ; AVX-NEXT:    movl %edi, %edx
    414 ; AVX-NEXT:    shll $5, %edx
    415 ; AVX-NEXT:    andl $262144, %edx # imm = 0x40000
    416 ; AVX-NEXT:    orl %ecx, %edx
    417 ; AVX-NEXT:    leal (,%rdi,8), %ecx
    418 ; AVX-NEXT:    andl $131072, %ecx # imm = 0x20000
    419 ; AVX-NEXT:    orl %edx, %ecx
    420 ; AVX-NEXT:    leal (%rdi,%rdi), %edx
    421 ; AVX-NEXT:    andl $65536, %edx # imm = 0x10000
    422 ; AVX-NEXT:    orl %ecx, %edx
    423 ; AVX-NEXT:    movl %edi, %ecx
    424 ; AVX-NEXT:    shrl %ecx
    425 ; AVX-NEXT:    andl $32768, %ecx # imm = 0x8000
    426 ; AVX-NEXT:    orl %edx, %ecx
    427 ; AVX-NEXT:    movl %edi, %edx
    428 ; AVX-NEXT:    shrl $3, %edx
    429 ; AVX-NEXT:    andl $16384, %edx # imm = 0x4000
    430 ; AVX-NEXT:    orl %ecx, %edx
    431 ; AVX-NEXT:    movl %edi, %ecx
    432 ; AVX-NEXT:    shrl $5, %ecx
    433 ; AVX-NEXT:    andl $8192, %ecx # imm = 0x2000
    434 ; AVX-NEXT:    orl %edx, %ecx
    435 ; AVX-NEXT:    movl %edi, %edx
    436 ; AVX-NEXT:    shrl $7, %edx
    437 ; AVX-NEXT:    andl $4096, %edx # imm = 0x1000
    438 ; AVX-NEXT:    orl %ecx, %edx
    439 ; AVX-NEXT:    movl %edi, %ecx
    440 ; AVX-NEXT:    shrl $9, %ecx
    441 ; AVX-NEXT:    andl $2048, %ecx # imm = 0x800
    442 ; AVX-NEXT:    orl %edx, %ecx
    443 ; AVX-NEXT:    movl %edi, %edx
    444 ; AVX-NEXT:    shrl $11, %edx
    445 ; AVX-NEXT:    andl $1024, %edx # imm = 0x400
    446 ; AVX-NEXT:    orl %ecx, %edx
    447 ; AVX-NEXT:    movl %edi, %ecx
    448 ; AVX-NEXT:    shrl $13, %ecx
    449 ; AVX-NEXT:    andl $512, %ecx # imm = 0x200
    450 ; AVX-NEXT:    orl %edx, %ecx
    451 ; AVX-NEXT:    movl %edi, %edx
    452 ; AVX-NEXT:    shrl $15, %edx
    453 ; AVX-NEXT:    andl $256, %edx # imm = 0x100
    454 ; AVX-NEXT:    orl %ecx, %edx
    455 ; AVX-NEXT:    movl %edi, %ecx
    456 ; AVX-NEXT:    shrl $17, %ecx
    457 ; AVX-NEXT:    andl $128, %ecx
    458 ; AVX-NEXT:    orl %edx, %ecx
    459 ; AVX-NEXT:    movl %edi, %edx
    460 ; AVX-NEXT:    shrl $19, %edx
    461 ; AVX-NEXT:    andl $64, %edx
    462 ; AVX-NEXT:    orl %ecx, %edx
    463 ; AVX-NEXT:    movl %edi, %ecx
    464 ; AVX-NEXT:    shrl $21, %ecx
    465 ; AVX-NEXT:    andl $32, %ecx
    466 ; AVX-NEXT:    orl %edx, %ecx
    467 ; AVX-NEXT:    movl %edi, %edx
    468 ; AVX-NEXT:    shrl $23, %edx
    469 ; AVX-NEXT:    andl $16, %edx
    470 ; AVX-NEXT:    orl %ecx, %edx
    471 ; AVX-NEXT:    movl %edi, %ecx
    472 ; AVX-NEXT:    shrl $25, %ecx
    473 ; AVX-NEXT:    andl $8, %ecx
    474 ; AVX-NEXT:    orl %edx, %ecx
    475 ; AVX-NEXT:    movl %edi, %edx
    476 ; AVX-NEXT:    shrl $27, %edx
    477 ; AVX-NEXT:    andl $4, %edx
    478 ; AVX-NEXT:    orl %ecx, %edx
    479 ; AVX-NEXT:    movl %edi, %ecx
    480 ; AVX-NEXT:    shrl $29, %ecx
    481 ; AVX-NEXT:    andl $2, %ecx
    482 ; AVX-NEXT:    orl %edx, %ecx
    483 ; AVX-NEXT:    shrl $31, %edi
    484 ; AVX-NEXT:    orl %ecx, %edi
    485 ; AVX-NEXT:    orl %edi, %eax
    486 ; AVX-NEXT:    retq
    487 ;
    488 ; XOP-LABEL: test_bitreverse_i32:
    489 ; XOP:       # BB#0:
    490 ; XOP-NEXT:    vmovd %edi, %xmm0
    491 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    492 ; XOP-NEXT:    vmovd %xmm0, %eax
    493 ; XOP-NEXT:    retq
    494   %b = call i32 @llvm.bitreverse.i32(i32 %a)
    495   ret i32 %b
    496 }
    497 
    498 define i64 @test_bitreverse_i64(i64 %a) nounwind {
    499 ; SSE-LABEL: test_bitreverse_i64:
    500 ; SSE:       # BB#0:
    501 ; SSE-NEXT:    leaq (%rdi,%rdi), %rax
    502 ; SSE-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
    503 ; SSE-NEXT:    andq %rax, %rcx
    504 ; SSE-NEXT:    movq %rdi, %rax
    505 ; SSE-NEXT:    shlq $63, %rax
    506 ; SSE-NEXT:    movq %rdi, %rdx
    507 ; SSE-NEXT:    andq $2, %rdx
    508 ; SSE-NEXT:    shlq $61, %rdx
    509 ; SSE-NEXT:    leaq (%rdx,%rax), %rax
    510 ; SSE-NEXT:    movq %rdi, %rdx
    511 ; SSE-NEXT:    andq $4, %rdx
    512 ; SSE-NEXT:    shlq $59, %rdx
    513 ; SSE-NEXT:    orq %rdx, %rax
    514 ; SSE-NEXT:    movq %rdi, %rdx
    515 ; SSE-NEXT:    andq $8, %rdx
    516 ; SSE-NEXT:    shlq $57, %rdx
    517 ; SSE-NEXT:    orq %rdx, %rax
    518 ; SSE-NEXT:    movq %rdi, %rdx
    519 ; SSE-NEXT:    andq $16, %rdx
    520 ; SSE-NEXT:    shlq $55, %rdx
    521 ; SSE-NEXT:    orq %rdx, %rax
    522 ; SSE-NEXT:    movq %rdi, %rdx
    523 ; SSE-NEXT:    andq $32, %rdx
    524 ; SSE-NEXT:    shlq $53, %rdx
    525 ; SSE-NEXT:    orq %rdx, %rax
    526 ; SSE-NEXT:    movq %rdi, %rdx
    527 ; SSE-NEXT:    andq $64, %rdx
    528 ; SSE-NEXT:    shlq $51, %rdx
    529 ; SSE-NEXT:    movq %rdi, %rsi
    530 ; SSE-NEXT:    andq $128, %rsi
    531 ; SSE-NEXT:    shlq $49, %rsi
    532 ; SSE-NEXT:    orq %rdx, %rsi
    533 ; SSE-NEXT:    movq %rdi, %rdx
    534 ; SSE-NEXT:    andq $256, %rdx # imm = 0x100
    535 ; SSE-NEXT:    shlq $47, %rdx
    536 ; SSE-NEXT:    orq %rsi, %rdx
    537 ; SSE-NEXT:    movq %rdi, %rsi
    538 ; SSE-NEXT:    andq $512, %rsi # imm = 0x200
    539 ; SSE-NEXT:    shlq $45, %rsi
    540 ; SSE-NEXT:    orq %rdx, %rsi
    541 ; SSE-NEXT:    movq %rdi, %rdx
    542 ; SSE-NEXT:    andq $1024, %rdx # imm = 0x400
    543 ; SSE-NEXT:    shlq $43, %rdx
    544 ; SSE-NEXT:    orq %rsi, %rdx
    545 ; SSE-NEXT:    movq %rdi, %rsi
    546 ; SSE-NEXT:    andq $2048, %rsi # imm = 0x800
    547 ; SSE-NEXT:    shlq $41, %rsi
    548 ; SSE-NEXT:    orq %rdx, %rsi
    549 ; SSE-NEXT:    movq %rdi, %rdx
    550 ; SSE-NEXT:    andq $4096, %rdx # imm = 0x1000
    551 ; SSE-NEXT:    shlq $39, %rdx
    552 ; SSE-NEXT:    orq %rsi, %rdx
    553 ; SSE-NEXT:    movq %rdi, %rsi
    554 ; SSE-NEXT:    andq $8192, %rsi # imm = 0x2000
    555 ; SSE-NEXT:    shlq $37, %rsi
    556 ; SSE-NEXT:    orq %rdx, %rsi
    557 ; SSE-NEXT:    movq %rdi, %rdx
    558 ; SSE-NEXT:    andq $16384, %rdx # imm = 0x4000
    559 ; SSE-NEXT:    shlq $35, %rdx
    560 ; SSE-NEXT:    orq %rsi, %rdx
    561 ; SSE-NEXT:    movq %rdi, %rsi
    562 ; SSE-NEXT:    andq $32768, %rsi # imm = 0x8000
    563 ; SSE-NEXT:    shlq $33, %rsi
    564 ; SSE-NEXT:    orq %rdx, %rsi
    565 ; SSE-NEXT:    movq %rdi, %rdx
    566 ; SSE-NEXT:    andq $65536, %rdx # imm = 0x10000
    567 ; SSE-NEXT:    shlq $31, %rdx
    568 ; SSE-NEXT:    orq %rsi, %rdx
    569 ; SSE-NEXT:    movq %rdi, %rsi
    570 ; SSE-NEXT:    andq $131072, %rsi # imm = 0x20000
    571 ; SSE-NEXT:    shlq $29, %rsi
    572 ; SSE-NEXT:    orq %rdx, %rsi
    573 ; SSE-NEXT:    movq %rdi, %rdx
    574 ; SSE-NEXT:    andq $262144, %rdx # imm = 0x40000
    575 ; SSE-NEXT:    shlq $27, %rdx
    576 ; SSE-NEXT:    orq %rsi, %rdx
    577 ; SSE-NEXT:    movq %rdi, %rsi
    578 ; SSE-NEXT:    andq $524288, %rsi # imm = 0x80000
    579 ; SSE-NEXT:    shlq $25, %rsi
    580 ; SSE-NEXT:    orq %rdx, %rsi
    581 ; SSE-NEXT:    movq %rdi, %rdx
    582 ; SSE-NEXT:    andq $1048576, %rdx # imm = 0x100000
    583 ; SSE-NEXT:    shlq $23, %rdx
    584 ; SSE-NEXT:    orq %rsi, %rdx
    585 ; SSE-NEXT:    movq %rdi, %rsi
    586 ; SSE-NEXT:    andq $2097152, %rsi # imm = 0x200000
    587 ; SSE-NEXT:    shlq $21, %rsi
    588 ; SSE-NEXT:    orq %rdx, %rsi
    589 ; SSE-NEXT:    movq %rdi, %rdx
    590 ; SSE-NEXT:    andq $4194304, %rdx # imm = 0x400000
    591 ; SSE-NEXT:    shlq $19, %rdx
    592 ; SSE-NEXT:    orq %rsi, %rdx
    593 ; SSE-NEXT:    movq %rdi, %rsi
    594 ; SSE-NEXT:    andq $8388608, %rsi # imm = 0x800000
    595 ; SSE-NEXT:    shlq $17, %rsi
    596 ; SSE-NEXT:    orq %rdx, %rsi
    597 ; SSE-NEXT:    movq %rdi, %rdx
    598 ; SSE-NEXT:    andq $16777216, %rdx # imm = 0x1000000
    599 ; SSE-NEXT:    shlq $15, %rdx
    600 ; SSE-NEXT:    orq %rsi, %rdx
    601 ; SSE-NEXT:    movq %rdi, %rsi
    602 ; SSE-NEXT:    andq $33554432, %rsi # imm = 0x2000000
    603 ; SSE-NEXT:    shlq $13, %rsi
    604 ; SSE-NEXT:    orq %rdx, %rsi
    605 ; SSE-NEXT:    movq %rdi, %rdx
    606 ; SSE-NEXT:    andq $67108864, %rdx # imm = 0x4000000
    607 ; SSE-NEXT:    shlq $11, %rdx
    608 ; SSE-NEXT:    orq %rsi, %rdx
    609 ; SSE-NEXT:    movq %rdi, %rsi
    610 ; SSE-NEXT:    andq $134217728, %rsi # imm = 0x8000000
    611 ; SSE-NEXT:    shlq $9, %rsi
    612 ; SSE-NEXT:    orq %rdx, %rsi
    613 ; SSE-NEXT:    movq %rdi, %rdx
    614 ; SSE-NEXT:    andq $268435456, %rdx # imm = 0x10000000
    615 ; SSE-NEXT:    shlq $7, %rdx
    616 ; SSE-NEXT:    orq %rsi, %rdx
    617 ; SSE-NEXT:    movq %rdi, %rsi
    618 ; SSE-NEXT:    andq $536870912, %rsi # imm = 0x20000000
    619 ; SSE-NEXT:    shlq $5, %rsi
    620 ; SSE-NEXT:    orq %rdx, %rsi
    621 ; SSE-NEXT:    movq %rdi, %rdx
    622 ; SSE-NEXT:    andq $1073741824, %rdx # imm = 0x40000000
    623 ; SSE-NEXT:    shlq $3, %rdx
    624 ; SSE-NEXT:    orq %rsi, %rdx
    625 ; SSE-NEXT:    orq %rcx, %rdx
    626 ; SSE-NEXT:    movq %rdi, %rcx
    627 ; SSE-NEXT:    shrq %rcx
    628 ; SSE-NEXT:    andl $-2147483648, %ecx # imm = 0x80000000
    629 ; SSE-NEXT:    orq %rdx, %rcx
    630 ; SSE-NEXT:    movq %rdi, %rdx
    631 ; SSE-NEXT:    shrq $3, %rdx
    632 ; SSE-NEXT:    andl $1073741824, %edx # imm = 0x40000000
    633 ; SSE-NEXT:    orq %rcx, %rdx
    634 ; SSE-NEXT:    movq %rdi, %rcx
    635 ; SSE-NEXT:    shrq $5, %rcx
    636 ; SSE-NEXT:    andl $536870912, %ecx # imm = 0x20000000
    637 ; SSE-NEXT:    orq %rdx, %rcx
    638 ; SSE-NEXT:    movq %rdi, %rdx
    639 ; SSE-NEXT:    shrq $7, %rdx
    640 ; SSE-NEXT:    andl $268435456, %edx # imm = 0x10000000
    641 ; SSE-NEXT:    orq %rcx, %rdx
    642 ; SSE-NEXT:    movq %rdi, %rcx
    643 ; SSE-NEXT:    shrq $9, %rcx
    644 ; SSE-NEXT:    andl $134217728, %ecx # imm = 0x8000000
    645 ; SSE-NEXT:    orq %rdx, %rcx
    646 ; SSE-NEXT:    movq %rdi, %rdx
    647 ; SSE-NEXT:    shrq $11, %rdx
    648 ; SSE-NEXT:    andl $67108864, %edx # imm = 0x4000000
    649 ; SSE-NEXT:    orq %rcx, %rdx
    650 ; SSE-NEXT:    movq %rdi, %rcx
    651 ; SSE-NEXT:    shrq $13, %rcx
    652 ; SSE-NEXT:    andl $33554432, %ecx # imm = 0x2000000
    653 ; SSE-NEXT:    orq %rdx, %rcx
    654 ; SSE-NEXT:    movq %rdi, %rdx
    655 ; SSE-NEXT:    shrq $15, %rdx
    656 ; SSE-NEXT:    andl $16777216, %edx # imm = 0x1000000
    657 ; SSE-NEXT:    orq %rcx, %rdx
    658 ; SSE-NEXT:    movq %rdi, %rcx
    659 ; SSE-NEXT:    shrq $17, %rcx
    660 ; SSE-NEXT:    andl $8388608, %ecx # imm = 0x800000
    661 ; SSE-NEXT:    orq %rdx, %rcx
    662 ; SSE-NEXT:    movq %rdi, %rdx
    663 ; SSE-NEXT:    shrq $19, %rdx
    664 ; SSE-NEXT:    andl $4194304, %edx # imm = 0x400000
    665 ; SSE-NEXT:    orq %rcx, %rdx
    666 ; SSE-NEXT:    movq %rdi, %rcx
    667 ; SSE-NEXT:    shrq $21, %rcx
    668 ; SSE-NEXT:    andl $2097152, %ecx # imm = 0x200000
    669 ; SSE-NEXT:    orq %rdx, %rcx
    670 ; SSE-NEXT:    movq %rdi, %rdx
    671 ; SSE-NEXT:    shrq $23, %rdx
    672 ; SSE-NEXT:    andl $1048576, %edx # imm = 0x100000
    673 ; SSE-NEXT:    orq %rcx, %rdx
    674 ; SSE-NEXT:    movq %rdi, %rcx
    675 ; SSE-NEXT:    shrq $25, %rcx
    676 ; SSE-NEXT:    andl $524288, %ecx # imm = 0x80000
    677 ; SSE-NEXT:    orq %rdx, %rcx
    678 ; SSE-NEXT:    movq %rdi, %rdx
    679 ; SSE-NEXT:    shrq $27, %rdx
    680 ; SSE-NEXT:    andl $262144, %edx # imm = 0x40000
    681 ; SSE-NEXT:    orq %rcx, %rdx
    682 ; SSE-NEXT:    movq %rdi, %rcx
    683 ; SSE-NEXT:    shrq $29, %rcx
    684 ; SSE-NEXT:    andl $131072, %ecx # imm = 0x20000
    685 ; SSE-NEXT:    orq %rdx, %rcx
    686 ; SSE-NEXT:    movq %rdi, %rdx
    687 ; SSE-NEXT:    shrq $31, %rdx
    688 ; SSE-NEXT:    andl $65536, %edx # imm = 0x10000
    689 ; SSE-NEXT:    orq %rcx, %rdx
    690 ; SSE-NEXT:    movq %rdi, %rcx
    691 ; SSE-NEXT:    shrq $33, %rcx
    692 ; SSE-NEXT:    andl $32768, %ecx # imm = 0x8000
    693 ; SSE-NEXT:    orq %rdx, %rcx
    694 ; SSE-NEXT:    movq %rdi, %rdx
    695 ; SSE-NEXT:    shrq $35, %rdx
    696 ; SSE-NEXT:    andl $16384, %edx # imm = 0x4000
    697 ; SSE-NEXT:    orq %rcx, %rdx
    698 ; SSE-NEXT:    movq %rdi, %rcx
    699 ; SSE-NEXT:    shrq $37, %rcx
    700 ; SSE-NEXT:    andl $8192, %ecx # imm = 0x2000
    701 ; SSE-NEXT:    orq %rdx, %rcx
    702 ; SSE-NEXT:    movq %rdi, %rdx
    703 ; SSE-NEXT:    shrq $39, %rdx
    704 ; SSE-NEXT:    andl $4096, %edx # imm = 0x1000
    705 ; SSE-NEXT:    orq %rcx, %rdx
    706 ; SSE-NEXT:    movq %rdi, %rcx
    707 ; SSE-NEXT:    shrq $41, %rcx
    708 ; SSE-NEXT:    andl $2048, %ecx # imm = 0x800
    709 ; SSE-NEXT:    orq %rdx, %rcx
    710 ; SSE-NEXT:    movq %rdi, %rdx
    711 ; SSE-NEXT:    shrq $43, %rdx
    712 ; SSE-NEXT:    andl $1024, %edx # imm = 0x400
    713 ; SSE-NEXT:    orq %rcx, %rdx
    714 ; SSE-NEXT:    movq %rdi, %rcx
    715 ; SSE-NEXT:    shrq $45, %rcx
    716 ; SSE-NEXT:    andl $512, %ecx # imm = 0x200
    717 ; SSE-NEXT:    orq %rdx, %rcx
    718 ; SSE-NEXT:    movq %rdi, %rdx
    719 ; SSE-NEXT:    shrq $47, %rdx
    720 ; SSE-NEXT:    andl $256, %edx # imm = 0x100
    721 ; SSE-NEXT:    orq %rcx, %rdx
    722 ; SSE-NEXT:    movq %rdi, %rcx
    723 ; SSE-NEXT:    shrq $49, %rcx
    724 ; SSE-NEXT:    andl $128, %ecx
    725 ; SSE-NEXT:    orq %rdx, %rcx
    726 ; SSE-NEXT:    movq %rdi, %rdx
    727 ; SSE-NEXT:    shrq $51, %rdx
    728 ; SSE-NEXT:    andl $64, %edx
    729 ; SSE-NEXT:    orq %rcx, %rdx
    730 ; SSE-NEXT:    movq %rdi, %rcx
    731 ; SSE-NEXT:    shrq $53, %rcx
    732 ; SSE-NEXT:    andl $32, %ecx
    733 ; SSE-NEXT:    orq %rdx, %rcx
    734 ; SSE-NEXT:    movq %rdi, %rdx
    735 ; SSE-NEXT:    shrq $55, %rdx
    736 ; SSE-NEXT:    andl $16, %edx
    737 ; SSE-NEXT:    orq %rcx, %rdx
    738 ; SSE-NEXT:    movq %rdi, %rcx
    739 ; SSE-NEXT:    shrq $57, %rcx
    740 ; SSE-NEXT:    andl $8, %ecx
    741 ; SSE-NEXT:    orq %rdx, %rcx
    742 ; SSE-NEXT:    movq %rdi, %rdx
    743 ; SSE-NEXT:    shrq $59, %rdx
    744 ; SSE-NEXT:    andl $4, %edx
    745 ; SSE-NEXT:    orq %rcx, %rdx
    746 ; SSE-NEXT:    movq %rdi, %rcx
    747 ; SSE-NEXT:    shrq $61, %rcx
    748 ; SSE-NEXT:    andl $2, %ecx
    749 ; SSE-NEXT:    orq %rdx, %rcx
    750 ; SSE-NEXT:    shrq $63, %rdi
    751 ; SSE-NEXT:    orq %rcx, %rdi
    752 ; SSE-NEXT:    orq %rdi, %rax
    753 ; SSE-NEXT:    retq
    754 ;
    755 ; AVX-LABEL: test_bitreverse_i64:
    756 ; AVX:       # BB#0:
    757 ; AVX-NEXT:    leaq (%rdi,%rdi), %rax
    758 ; AVX-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
    759 ; AVX-NEXT:    andq %rax, %rcx
    760 ; AVX-NEXT:    movq %rdi, %rax
    761 ; AVX-NEXT:    shlq $63, %rax
    762 ; AVX-NEXT:    movq %rdi, %rdx
    763 ; AVX-NEXT:    andq $2, %rdx
    764 ; AVX-NEXT:    shlq $61, %rdx
    765 ; AVX-NEXT:    leaq (%rdx,%rax), %rax
    766 ; AVX-NEXT:    movq %rdi, %rdx
    767 ; AVX-NEXT:    andq $4, %rdx
    768 ; AVX-NEXT:    shlq $59, %rdx
    769 ; AVX-NEXT:    orq %rdx, %rax
    770 ; AVX-NEXT:    movq %rdi, %rdx
    771 ; AVX-NEXT:    andq $8, %rdx
    772 ; AVX-NEXT:    shlq $57, %rdx
    773 ; AVX-NEXT:    orq %rdx, %rax
    774 ; AVX-NEXT:    movq %rdi, %rdx
    775 ; AVX-NEXT:    andq $16, %rdx
    776 ; AVX-NEXT:    shlq $55, %rdx
    777 ; AVX-NEXT:    orq %rdx, %rax
    778 ; AVX-NEXT:    movq %rdi, %rdx
    779 ; AVX-NEXT:    andq $32, %rdx
    780 ; AVX-NEXT:    shlq $53, %rdx
    781 ; AVX-NEXT:    orq %rdx, %rax
    782 ; AVX-NEXT:    movq %rdi, %rdx
    783 ; AVX-NEXT:    andq $64, %rdx
    784 ; AVX-NEXT:    shlq $51, %rdx
    785 ; AVX-NEXT:    movq %rdi, %rsi
    786 ; AVX-NEXT:    andq $128, %rsi
    787 ; AVX-NEXT:    shlq $49, %rsi
    788 ; AVX-NEXT:    orq %rdx, %rsi
    789 ; AVX-NEXT:    movq %rdi, %rdx
    790 ; AVX-NEXT:    andq $256, %rdx # imm = 0x100
    791 ; AVX-NEXT:    shlq $47, %rdx
    792 ; AVX-NEXT:    orq %rsi, %rdx
    793 ; AVX-NEXT:    movq %rdi, %rsi
    794 ; AVX-NEXT:    andq $512, %rsi # imm = 0x200
    795 ; AVX-NEXT:    shlq $45, %rsi
    796 ; AVX-NEXT:    orq %rdx, %rsi
    797 ; AVX-NEXT:    movq %rdi, %rdx
    798 ; AVX-NEXT:    andq $1024, %rdx # imm = 0x400
    799 ; AVX-NEXT:    shlq $43, %rdx
    800 ; AVX-NEXT:    orq %rsi, %rdx
    801 ; AVX-NEXT:    movq %rdi, %rsi
    802 ; AVX-NEXT:    andq $2048, %rsi # imm = 0x800
    803 ; AVX-NEXT:    shlq $41, %rsi
    804 ; AVX-NEXT:    orq %rdx, %rsi
    805 ; AVX-NEXT:    movq %rdi, %rdx
    806 ; AVX-NEXT:    andq $4096, %rdx # imm = 0x1000
    807 ; AVX-NEXT:    shlq $39, %rdx
    808 ; AVX-NEXT:    orq %rsi, %rdx
    809 ; AVX-NEXT:    movq %rdi, %rsi
    810 ; AVX-NEXT:    andq $8192, %rsi # imm = 0x2000
    811 ; AVX-NEXT:    shlq $37, %rsi
    812 ; AVX-NEXT:    orq %rdx, %rsi
    813 ; AVX-NEXT:    movq %rdi, %rdx
    814 ; AVX-NEXT:    andq $16384, %rdx # imm = 0x4000
    815 ; AVX-NEXT:    shlq $35, %rdx
    816 ; AVX-NEXT:    orq %rsi, %rdx
    817 ; AVX-NEXT:    movq %rdi, %rsi
    818 ; AVX-NEXT:    andq $32768, %rsi # imm = 0x8000
    819 ; AVX-NEXT:    shlq $33, %rsi
    820 ; AVX-NEXT:    orq %rdx, %rsi
    821 ; AVX-NEXT:    movq %rdi, %rdx
    822 ; AVX-NEXT:    andq $65536, %rdx # imm = 0x10000
    823 ; AVX-NEXT:    shlq $31, %rdx
    824 ; AVX-NEXT:    orq %rsi, %rdx
    825 ; AVX-NEXT:    movq %rdi, %rsi
    826 ; AVX-NEXT:    andq $131072, %rsi # imm = 0x20000
    827 ; AVX-NEXT:    shlq $29, %rsi
    828 ; AVX-NEXT:    orq %rdx, %rsi
    829 ; AVX-NEXT:    movq %rdi, %rdx
    830 ; AVX-NEXT:    andq $262144, %rdx # imm = 0x40000
    831 ; AVX-NEXT:    shlq $27, %rdx
    832 ; AVX-NEXT:    orq %rsi, %rdx
    833 ; AVX-NEXT:    movq %rdi, %rsi
    834 ; AVX-NEXT:    andq $524288, %rsi # imm = 0x80000
    835 ; AVX-NEXT:    shlq $25, %rsi
    836 ; AVX-NEXT:    orq %rdx, %rsi
    837 ; AVX-NEXT:    movq %rdi, %rdx
    838 ; AVX-NEXT:    andq $1048576, %rdx # imm = 0x100000
    839 ; AVX-NEXT:    shlq $23, %rdx
    840 ; AVX-NEXT:    orq %rsi, %rdx
    841 ; AVX-NEXT:    movq %rdi, %rsi
    842 ; AVX-NEXT:    andq $2097152, %rsi # imm = 0x200000
    843 ; AVX-NEXT:    shlq $21, %rsi
    844 ; AVX-NEXT:    orq %rdx, %rsi
    845 ; AVX-NEXT:    movq %rdi, %rdx
    846 ; AVX-NEXT:    andq $4194304, %rdx # imm = 0x400000
    847 ; AVX-NEXT:    shlq $19, %rdx
    848 ; AVX-NEXT:    orq %rsi, %rdx
    849 ; AVX-NEXT:    movq %rdi, %rsi
    850 ; AVX-NEXT:    andq $8388608, %rsi # imm = 0x800000
    851 ; AVX-NEXT:    shlq $17, %rsi
    852 ; AVX-NEXT:    orq %rdx, %rsi
    853 ; AVX-NEXT:    movq %rdi, %rdx
    854 ; AVX-NEXT:    andq $16777216, %rdx # imm = 0x1000000
    855 ; AVX-NEXT:    shlq $15, %rdx
    856 ; AVX-NEXT:    orq %rsi, %rdx
    857 ; AVX-NEXT:    movq %rdi, %rsi
    858 ; AVX-NEXT:    andq $33554432, %rsi # imm = 0x2000000
    859 ; AVX-NEXT:    shlq $13, %rsi
    860 ; AVX-NEXT:    orq %rdx, %rsi
    861 ; AVX-NEXT:    movq %rdi, %rdx
    862 ; AVX-NEXT:    andq $67108864, %rdx # imm = 0x4000000
    863 ; AVX-NEXT:    shlq $11, %rdx
    864 ; AVX-NEXT:    orq %rsi, %rdx
    865 ; AVX-NEXT:    movq %rdi, %rsi
    866 ; AVX-NEXT:    andq $134217728, %rsi # imm = 0x8000000
    867 ; AVX-NEXT:    shlq $9, %rsi
    868 ; AVX-NEXT:    orq %rdx, %rsi
    869 ; AVX-NEXT:    movq %rdi, %rdx
    870 ; AVX-NEXT:    andq $268435456, %rdx # imm = 0x10000000
    871 ; AVX-NEXT:    shlq $7, %rdx
    872 ; AVX-NEXT:    orq %rsi, %rdx
    873 ; AVX-NEXT:    movq %rdi, %rsi
    874 ; AVX-NEXT:    andq $536870912, %rsi # imm = 0x20000000
    875 ; AVX-NEXT:    shlq $5, %rsi
    876 ; AVX-NEXT:    orq %rdx, %rsi
    877 ; AVX-NEXT:    movq %rdi, %rdx
    878 ; AVX-NEXT:    andq $1073741824, %rdx # imm = 0x40000000
    879 ; AVX-NEXT:    shlq $3, %rdx
    880 ; AVX-NEXT:    orq %rsi, %rdx
    881 ; AVX-NEXT:    orq %rcx, %rdx
    882 ; AVX-NEXT:    movq %rdi, %rcx
    883 ; AVX-NEXT:    shrq %rcx
    884 ; AVX-NEXT:    andl $-2147483648, %ecx # imm = 0x80000000
    885 ; AVX-NEXT:    orq %rdx, %rcx
    886 ; AVX-NEXT:    movq %rdi, %rdx
    887 ; AVX-NEXT:    shrq $3, %rdx
    888 ; AVX-NEXT:    andl $1073741824, %edx # imm = 0x40000000
    889 ; AVX-NEXT:    orq %rcx, %rdx
    890 ; AVX-NEXT:    movq %rdi, %rcx
    891 ; AVX-NEXT:    shrq $5, %rcx
    892 ; AVX-NEXT:    andl $536870912, %ecx # imm = 0x20000000
    893 ; AVX-NEXT:    orq %rdx, %rcx
    894 ; AVX-NEXT:    movq %rdi, %rdx
    895 ; AVX-NEXT:    shrq $7, %rdx
    896 ; AVX-NEXT:    andl $268435456, %edx # imm = 0x10000000
    897 ; AVX-NEXT:    orq %rcx, %rdx
    898 ; AVX-NEXT:    movq %rdi, %rcx
    899 ; AVX-NEXT:    shrq $9, %rcx
    900 ; AVX-NEXT:    andl $134217728, %ecx # imm = 0x8000000
    901 ; AVX-NEXT:    orq %rdx, %rcx
    902 ; AVX-NEXT:    movq %rdi, %rdx
    903 ; AVX-NEXT:    shrq $11, %rdx
    904 ; AVX-NEXT:    andl $67108864, %edx # imm = 0x4000000
    905 ; AVX-NEXT:    orq %rcx, %rdx
    906 ; AVX-NEXT:    movq %rdi, %rcx
    907 ; AVX-NEXT:    shrq $13, %rcx
    908 ; AVX-NEXT:    andl $33554432, %ecx # imm = 0x2000000
    909 ; AVX-NEXT:    orq %rdx, %rcx
    910 ; AVX-NEXT:    movq %rdi, %rdx
    911 ; AVX-NEXT:    shrq $15, %rdx
    912 ; AVX-NEXT:    andl $16777216, %edx # imm = 0x1000000
    913 ; AVX-NEXT:    orq %rcx, %rdx
    914 ; AVX-NEXT:    movq %rdi, %rcx
    915 ; AVX-NEXT:    shrq $17, %rcx
    916 ; AVX-NEXT:    andl $8388608, %ecx # imm = 0x800000
    917 ; AVX-NEXT:    orq %rdx, %rcx
    918 ; AVX-NEXT:    movq %rdi, %rdx
    919 ; AVX-NEXT:    shrq $19, %rdx
    920 ; AVX-NEXT:    andl $4194304, %edx # imm = 0x400000
    921 ; AVX-NEXT:    orq %rcx, %rdx
    922 ; AVX-NEXT:    movq %rdi, %rcx
    923 ; AVX-NEXT:    shrq $21, %rcx
    924 ; AVX-NEXT:    andl $2097152, %ecx # imm = 0x200000
    925 ; AVX-NEXT:    orq %rdx, %rcx
    926 ; AVX-NEXT:    movq %rdi, %rdx
    927 ; AVX-NEXT:    shrq $23, %rdx
    928 ; AVX-NEXT:    andl $1048576, %edx # imm = 0x100000
    929 ; AVX-NEXT:    orq %rcx, %rdx
    930 ; AVX-NEXT:    movq %rdi, %rcx
    931 ; AVX-NEXT:    shrq $25, %rcx
    932 ; AVX-NEXT:    andl $524288, %ecx # imm = 0x80000
    933 ; AVX-NEXT:    orq %rdx, %rcx
    934 ; AVX-NEXT:    movq %rdi, %rdx
    935 ; AVX-NEXT:    shrq $27, %rdx
    936 ; AVX-NEXT:    andl $262144, %edx # imm = 0x40000
    937 ; AVX-NEXT:    orq %rcx, %rdx
    938 ; AVX-NEXT:    movq %rdi, %rcx
    939 ; AVX-NEXT:    shrq $29, %rcx
    940 ; AVX-NEXT:    andl $131072, %ecx # imm = 0x20000
    941 ; AVX-NEXT:    orq %rdx, %rcx
    942 ; AVX-NEXT:    movq %rdi, %rdx
    943 ; AVX-NEXT:    shrq $31, %rdx
    944 ; AVX-NEXT:    andl $65536, %edx # imm = 0x10000
    945 ; AVX-NEXT:    orq %rcx, %rdx
    946 ; AVX-NEXT:    movq %rdi, %rcx
    947 ; AVX-NEXT:    shrq $33, %rcx
    948 ; AVX-NEXT:    andl $32768, %ecx # imm = 0x8000
    949 ; AVX-NEXT:    orq %rdx, %rcx
    950 ; AVX-NEXT:    movq %rdi, %rdx
    951 ; AVX-NEXT:    shrq $35, %rdx
    952 ; AVX-NEXT:    andl $16384, %edx # imm = 0x4000
    953 ; AVX-NEXT:    orq %rcx, %rdx
    954 ; AVX-NEXT:    movq %rdi, %rcx
    955 ; AVX-NEXT:    shrq $37, %rcx
    956 ; AVX-NEXT:    andl $8192, %ecx # imm = 0x2000
    957 ; AVX-NEXT:    orq %rdx, %rcx
    958 ; AVX-NEXT:    movq %rdi, %rdx
    959 ; AVX-NEXT:    shrq $39, %rdx
    960 ; AVX-NEXT:    andl $4096, %edx # imm = 0x1000
    961 ; AVX-NEXT:    orq %rcx, %rdx
    962 ; AVX-NEXT:    movq %rdi, %rcx
    963 ; AVX-NEXT:    shrq $41, %rcx
    964 ; AVX-NEXT:    andl $2048, %ecx # imm = 0x800
    965 ; AVX-NEXT:    orq %rdx, %rcx
    966 ; AVX-NEXT:    movq %rdi, %rdx
    967 ; AVX-NEXT:    shrq $43, %rdx
    968 ; AVX-NEXT:    andl $1024, %edx # imm = 0x400
    969 ; AVX-NEXT:    orq %rcx, %rdx
    970 ; AVX-NEXT:    movq %rdi, %rcx
    971 ; AVX-NEXT:    shrq $45, %rcx
    972 ; AVX-NEXT:    andl $512, %ecx # imm = 0x200
    973 ; AVX-NEXT:    orq %rdx, %rcx
    974 ; AVX-NEXT:    movq %rdi, %rdx
    975 ; AVX-NEXT:    shrq $47, %rdx
    976 ; AVX-NEXT:    andl $256, %edx # imm = 0x100
    977 ; AVX-NEXT:    orq %rcx, %rdx
    978 ; AVX-NEXT:    movq %rdi, %rcx
    979 ; AVX-NEXT:    shrq $49, %rcx
    980 ; AVX-NEXT:    andl $128, %ecx
    981 ; AVX-NEXT:    orq %rdx, %rcx
    982 ; AVX-NEXT:    movq %rdi, %rdx
    983 ; AVX-NEXT:    shrq $51, %rdx
    984 ; AVX-NEXT:    andl $64, %edx
    985 ; AVX-NEXT:    orq %rcx, %rdx
    986 ; AVX-NEXT:    movq %rdi, %rcx
    987 ; AVX-NEXT:    shrq $53, %rcx
    988 ; AVX-NEXT:    andl $32, %ecx
    989 ; AVX-NEXT:    orq %rdx, %rcx
    990 ; AVX-NEXT:    movq %rdi, %rdx
    991 ; AVX-NEXT:    shrq $55, %rdx
    992 ; AVX-NEXT:    andl $16, %edx
    993 ; AVX-NEXT:    orq %rcx, %rdx
    994 ; AVX-NEXT:    movq %rdi, %rcx
    995 ; AVX-NEXT:    shrq $57, %rcx
    996 ; AVX-NEXT:    andl $8, %ecx
    997 ; AVX-NEXT:    orq %rdx, %rcx
    998 ; AVX-NEXT:    movq %rdi, %rdx
    999 ; AVX-NEXT:    shrq $59, %rdx
   1000 ; AVX-NEXT:    andl $4, %edx
   1001 ; AVX-NEXT:    orq %rcx, %rdx
   1002 ; AVX-NEXT:    movq %rdi, %rcx
   1003 ; AVX-NEXT:    shrq $61, %rcx
   1004 ; AVX-NEXT:    andl $2, %ecx
   1005 ; AVX-NEXT:    orq %rdx, %rcx
   1006 ; AVX-NEXT:    shrq $63, %rdi
   1007 ; AVX-NEXT:    orq %rcx, %rdi
   1008 ; AVX-NEXT:    orq %rdi, %rax
   1009 ; AVX-NEXT:    retq
   1010 ;
   1011 ; XOP-LABEL: test_bitreverse_i64:
   1012 ; XOP:       # BB#0:
   1013 ; XOP-NEXT:    vmovq %rdi, %xmm0
   1014 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
   1015 ; XOP-NEXT:    vmovq %xmm0, %rax
   1016 ; XOP-NEXT:    retq
   1017   %b = call i64 @llvm.bitreverse.i64(i64 %a)
   1018   ret i64 %b
   1019 }
   1020 
   1021 define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
   1022 ; SSE2-LABEL: test_bitreverse_v16i8:
   1023 ; SSE2:       # BB#0:
   1024 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1025 ; SSE2-NEXT:    psrlw $7, %xmm2
   1026 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   1027 ; SSE2-NEXT:    pand %xmm1, %xmm1
   1028 ; SSE2-NEXT:    pand %xmm2, %xmm1
   1029 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1030 ; SSE2-NEXT:    psllw $7, %xmm2
   1031 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   1032 ; SSE2-NEXT:    pand %xmm3, %xmm3
   1033 ; SSE2-NEXT:    pand %xmm3, %xmm2
   1034 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1035 ; SSE2-NEXT:    psllw $5, %xmm3
   1036 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1037 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1038 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1039 ; SSE2-NEXT:    psllw $3, %xmm4
   1040 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
   1041 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
   1042 ; SSE2-NEXT:    por %xmm3, %xmm4
   1043 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1044 ; SSE2-NEXT:    paddb %xmm3, %xmm3
   1045 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1046 ; SSE2-NEXT:    por %xmm4, %xmm3
   1047 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1048 ; SSE2-NEXT:    psrlw $1, %xmm4
   1049 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
   1050 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
   1051 ; SSE2-NEXT:    por %xmm3, %xmm4
   1052 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1053 ; SSE2-NEXT:    psrlw $3, %xmm3
   1054 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1055 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1056 ; SSE2-NEXT:    por %xmm4, %xmm3
   1057 ; SSE2-NEXT:    psrlw $5, %xmm0
   1058 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1059 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1060 ; SSE2-NEXT:    por %xmm3, %xmm0
   1061 ; SSE2-NEXT:    por %xmm1, %xmm0
   1062 ; SSE2-NEXT:    por %xmm2, %xmm0
   1063 ; SSE2-NEXT:    retq
   1064 ;
   1065 ; SSSE3-LABEL: test_bitreverse_v16i8:
   1066 ; SSSE3:       # BB#0:
   1067 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1068 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   1069 ; SSSE3-NEXT:    pand %xmm1, %xmm2
   1070 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1071 ; SSSE3-NEXT:    pshufb %xmm2, %xmm3
   1072 ; SSSE3-NEXT:    psrlw $4, %xmm0
   1073 ; SSSE3-NEXT:    pand %xmm1, %xmm0
   1074 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1075 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
   1076 ; SSSE3-NEXT:    por %xmm3, %xmm1
   1077 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1078 ; SSSE3-NEXT:    retq
   1079 ;
   1080 ; AVX-LABEL: test_bitreverse_v16i8:
   1081 ; AVX:       # BB#0:
   1082 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1083 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
   1084 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1085 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
   1086 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1087 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1088 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1089 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
   1090 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1091 ; AVX-NEXT:    retq
   1092 ;
   1093 ; XOP-LABEL: test_bitreverse_v16i8:
   1094 ; XOP:       # BB#0:
   1095 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
   1096 ; XOP-NEXT:    retq
   1097   %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
   1098   ret <16 x i8> %b
   1099 }
   1100 
   1101 define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
   1102 ; SSE2-LABEL: test_bitreverse_v8i16:
   1103 ; SSE2:       # BB#0:
   1104 ; SSE2-NEXT:    pxor %xmm1, %xmm1
   1105 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1106 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
   1107 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
   1108 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
   1109 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1110 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
   1111 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6]
   1112 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   1113 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1114 ; SSE2-NEXT:    psllw $7, %xmm0
   1115 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   1116 ; SSE2-NEXT:    pand %xmm2, %xmm2
   1117 ; SSE2-NEXT:    pand %xmm0, %xmm2
   1118 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1119 ; SSE2-NEXT:    psllw $5, %xmm0
   1120 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1121 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1122 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1123 ; SSE2-NEXT:    psllw $3, %xmm3
   1124 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1125 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1126 ; SSE2-NEXT:    por %xmm0, %xmm3
   1127 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1128 ; SSE2-NEXT:    paddb %xmm0, %xmm0
   1129 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1130 ; SSE2-NEXT:    por %xmm3, %xmm0
   1131 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1132 ; SSE2-NEXT:    psrlw $1, %xmm3
   1133 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1134 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1135 ; SSE2-NEXT:    por %xmm0, %xmm3
   1136 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1137 ; SSE2-NEXT:    psrlw $3, %xmm0
   1138 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1139 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1140 ; SSE2-NEXT:    por %xmm3, %xmm0
   1141 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1142 ; SSE2-NEXT:    psrlw $5, %xmm3
   1143 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1144 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1145 ; SSE2-NEXT:    por %xmm0, %xmm3
   1146 ; SSE2-NEXT:    psrlw $7, %xmm1
   1147 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   1148 ; SSE2-NEXT:    pand %xmm0, %xmm0
   1149 ; SSE2-NEXT:    pand %xmm1, %xmm0
   1150 ; SSE2-NEXT:    por %xmm3, %xmm0
   1151 ; SSE2-NEXT:    por %xmm2, %xmm0
   1152 ; SSE2-NEXT:    retq
   1153 ;
   1154 ; SSSE3-LABEL: test_bitreverse_v8i16:
   1155 ; SSSE3:       # BB#0:
   1156 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   1157 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1158 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   1159 ; SSSE3-NEXT:    pand %xmm1, %xmm2
   1160 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1161 ; SSSE3-NEXT:    pshufb %xmm2, %xmm3
   1162 ; SSSE3-NEXT:    psrlw $4, %xmm0
   1163 ; SSSE3-NEXT:    pand %xmm1, %xmm0
   1164 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1165 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
   1166 ; SSSE3-NEXT:    por %xmm3, %xmm1
   1167 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1168 ; SSSE3-NEXT:    retq
   1169 ;
   1170 ; AVX-LABEL: test_bitreverse_v8i16:
   1171 ; AVX:       # BB#0:
   1172 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   1173 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1174 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
   1175 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1176 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
   1177 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1178 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1179 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1180 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
   1181 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1182 ; AVX-NEXT:    retq
   1183 ;
   1184 ; XOP-LABEL: test_bitreverse_v8i16:
   1185 ; XOP:       # BB#0:
   1186 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
   1187 ; XOP-NEXT:    retq
   1188   %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
   1189   ret <8 x i16> %b
   1190 }
   1191 
   1192 define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
   1193 ; SSE2-LABEL: test_bitreverse_v4i32:
   1194 ; SSE2:       # BB#0:
   1195 ; SSE2-NEXT:    pxor %xmm1, %xmm1
   1196 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1197 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
   1198 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
   1199 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
   1200 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1201 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   1202 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
   1203 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   1204 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1205 ; SSE2-NEXT:    psllw $7, %xmm0
   1206 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   1207 ; SSE2-NEXT:    pand %xmm2, %xmm2
   1208 ; SSE2-NEXT:    pand %xmm0, %xmm2
   1209 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1210 ; SSE2-NEXT:    psllw $5, %xmm0
   1211 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1212 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1213 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1214 ; SSE2-NEXT:    psllw $3, %xmm3
   1215 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1216 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1217 ; SSE2-NEXT:    por %xmm0, %xmm3
   1218 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1219 ; SSE2-NEXT:    paddb %xmm0, %xmm0
   1220 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1221 ; SSE2-NEXT:    por %xmm3, %xmm0
   1222 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1223 ; SSE2-NEXT:    psrlw $1, %xmm3
   1224 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1225 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1226 ; SSE2-NEXT:    por %xmm0, %xmm3
   1227 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1228 ; SSE2-NEXT:    psrlw $3, %xmm0
   1229 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1230 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1231 ; SSE2-NEXT:    por %xmm3, %xmm0
   1232 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1233 ; SSE2-NEXT:    psrlw $5, %xmm3
   1234 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1235 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1236 ; SSE2-NEXT:    por %xmm0, %xmm3
   1237 ; SSE2-NEXT:    psrlw $7, %xmm1
   1238 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   1239 ; SSE2-NEXT:    pand %xmm0, %xmm0
   1240 ; SSE2-NEXT:    pand %xmm1, %xmm0
   1241 ; SSE2-NEXT:    por %xmm3, %xmm0
   1242 ; SSE2-NEXT:    por %xmm2, %xmm0
   1243 ; SSE2-NEXT:    retq
   1244 ;
   1245 ; SSSE3-LABEL: test_bitreverse_v4i32:
   1246 ; SSSE3:       # BB#0:
   1247 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   1248 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1249 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   1250 ; SSSE3-NEXT:    pand %xmm1, %xmm2
   1251 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1252 ; SSSE3-NEXT:    pshufb %xmm2, %xmm3
   1253 ; SSSE3-NEXT:    psrlw $4, %xmm0
   1254 ; SSSE3-NEXT:    pand %xmm1, %xmm0
   1255 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1256 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
   1257 ; SSSE3-NEXT:    por %xmm3, %xmm1
   1258 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1259 ; SSSE3-NEXT:    retq
   1260 ;
   1261 ; AVX-LABEL: test_bitreverse_v4i32:
   1262 ; AVX:       # BB#0:
   1263 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   1264 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1265 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
   1266 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1267 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
   1268 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1269 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1270 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1271 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
   1272 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1273 ; AVX-NEXT:    retq
   1274 ;
   1275 ; XOP-LABEL: test_bitreverse_v4i32:
   1276 ; XOP:       # BB#0:
   1277 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
   1278 ; XOP-NEXT:    retq
   1279   %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
   1280   ret <4 x i32> %b
   1281 }
   1282 
   1283 define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
   1284 ; SSE2-LABEL: test_bitreverse_v2i64:
   1285 ; SSE2:       # BB#0:
   1286 ; SSE2-NEXT:    pxor %xmm1, %xmm1
   1287 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1288 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
   1289 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   1290 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
   1291 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
   1292 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1293 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1294 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   1295 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
   1296 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   1297 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1298 ; SSE2-NEXT:    psllw $7, %xmm0
   1299 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   1300 ; SSE2-NEXT:    pand %xmm2, %xmm2
   1301 ; SSE2-NEXT:    pand %xmm0, %xmm2
   1302 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1303 ; SSE2-NEXT:    psllw $5, %xmm0
   1304 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1305 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1306 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1307 ; SSE2-NEXT:    psllw $3, %xmm3
   1308 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1309 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1310 ; SSE2-NEXT:    por %xmm0, %xmm3
   1311 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1312 ; SSE2-NEXT:    paddb %xmm0, %xmm0
   1313 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1314 ; SSE2-NEXT:    por %xmm3, %xmm0
   1315 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1316 ; SSE2-NEXT:    psrlw $1, %xmm3
   1317 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1318 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1319 ; SSE2-NEXT:    por %xmm0, %xmm3
   1320 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1321 ; SSE2-NEXT:    psrlw $3, %xmm0
   1322 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1323 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
   1324 ; SSE2-NEXT:    por %xmm3, %xmm0
   1325 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1326 ; SSE2-NEXT:    psrlw $5, %xmm3
   1327 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1328 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
   1329 ; SSE2-NEXT:    por %xmm0, %xmm3
   1330 ; SSE2-NEXT:    psrlw $7, %xmm1
   1331 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   1332 ; SSE2-NEXT:    pand %xmm0, %xmm0
   1333 ; SSE2-NEXT:    pand %xmm1, %xmm0
   1334 ; SSE2-NEXT:    por %xmm3, %xmm0
   1335 ; SSE2-NEXT:    por %xmm2, %xmm0
   1336 ; SSE2-NEXT:    retq
   1337 ;
   1338 ; SSSE3-LABEL: test_bitreverse_v2i64:
   1339 ; SSSE3:       # BB#0:
   1340 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   1341 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1342 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   1343 ; SSSE3-NEXT:    pand %xmm1, %xmm2
   1344 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1345 ; SSSE3-NEXT:    pshufb %xmm2, %xmm3
   1346 ; SSSE3-NEXT:    psrlw $4, %xmm0
   1347 ; SSSE3-NEXT:    pand %xmm1, %xmm0
   1348 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1349 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
   1350 ; SSSE3-NEXT:    por %xmm3, %xmm1
   1351 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1352 ; SSSE3-NEXT:    retq
   1353 ;
   1354 ; AVX-LABEL: test_bitreverse_v2i64:
   1355 ; AVX:       # BB#0:
   1356 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   1357 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1358 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
   1359 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1360 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
   1361 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1362 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1363 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1364 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
   1365 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1366 ; AVX-NEXT:    retq
   1367 ;
   1368 ; XOP-LABEL: test_bitreverse_v2i64:
   1369 ; XOP:       # BB#0:
   1370 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
   1371 ; XOP-NEXT:    retq
   1372   %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
   1373   ret <2 x i64> %b
   1374 }
   1375 
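; For the 256-bit tests that follow, AVX1 and XOP have no 256-bit integer
; shuffles, so the high half is split off with vextractf128 (vextracti128 in
; the XOPAVX2 run), each 128-bit half is reversed independently, and the halves
; are rejoined with vinsertf128/vinserti128. AVX2 and AVX512 perform the nibble
; lookup directly on ymm registers, with the 16-byte tables repeated in both
; 128-bit lanes.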
   1376 define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
   1377 ; SSE2-LABEL: test_bitreverse_v32i8:
   1378 ; SSE2:       # BB#0:
   1379 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1380 ; SSE2-NEXT:    psllw $5, %xmm2
   1381 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
   1382 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm9
   1383 ; SSE2-NEXT:    pand %xmm9, %xmm2
   1384 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   1385 ; SSE2-NEXT:    psllw $7, %xmm5
   1386 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   1387 ; SSE2-NEXT:    pand %xmm10, %xmm10
   1388 ; SSE2-NEXT:    pand %xmm10, %xmm5
   1389 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1390 ; SSE2-NEXT:    psllw $3, %xmm3
   1391 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
   1392 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm11
   1393 ; SSE2-NEXT:    pand %xmm11, %xmm3
   1394 ; SSE2-NEXT:    por %xmm2, %xmm3
   1395 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1396 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1397 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
   1398 ; SSE2-NEXT:    pand %xmm8, %xmm2
   1399 ; SSE2-NEXT:    por %xmm3, %xmm2
   1400 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1401 ; SSE2-NEXT:    psrlw $1, %xmm3
   1402 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   1403 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
   1404 ; SSE2-NEXT:    pand %xmm12, %xmm3
   1405 ; SSE2-NEXT:    por %xmm2, %xmm3
   1406 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1407 ; SSE2-NEXT:    psrlw $3, %xmm4
   1408 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
   1409 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm6
   1410 ; SSE2-NEXT:    pand %xmm6, %xmm4
   1411 ; SSE2-NEXT:    por %xmm3, %xmm4
   1412 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   1413 ; SSE2-NEXT:    psrlw $5, %xmm7
   1414 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
   1415 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
   1416 ; SSE2-NEXT:    pand %xmm2, %xmm7
   1417 ; SSE2-NEXT:    por %xmm4, %xmm7
   1418 ; SSE2-NEXT:    psrlw $7, %xmm0
   1419 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   1420 ; SSE2-NEXT:    pand %xmm3, %xmm3
   1421 ; SSE2-NEXT:    pand %xmm3, %xmm0
   1422 ; SSE2-NEXT:    por %xmm7, %xmm0
   1423 ; SSE2-NEXT:    por %xmm5, %xmm0
   1424 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1425 ; SSE2-NEXT:    psllw $5, %xmm4
   1426 ; SSE2-NEXT:    pand %xmm9, %xmm4
   1427 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1428 ; SSE2-NEXT:    psllw $7, %xmm5
   1429 ; SSE2-NEXT:    pand %xmm10, %xmm5
   1430 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   1431 ; SSE2-NEXT:    psllw $3, %xmm7
   1432 ; SSE2-NEXT:    pand %xmm11, %xmm7
   1433 ; SSE2-NEXT:    por %xmm4, %xmm7
   1434 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1435 ; SSE2-NEXT:    paddb %xmm4, %xmm4
   1436 ; SSE2-NEXT:    pand %xmm8, %xmm4
   1437 ; SSE2-NEXT:    por %xmm7, %xmm4
   1438 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   1439 ; SSE2-NEXT:    psrlw $1, %xmm7
   1440 ; SSE2-NEXT:    pand %xmm12, %xmm7
   1441 ; SSE2-NEXT:    por %xmm4, %xmm7
   1442 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1443 ; SSE2-NEXT:    psrlw $3, %xmm4
   1444 ; SSE2-NEXT:    pand %xmm6, %xmm4
   1445 ; SSE2-NEXT:    por %xmm7, %xmm4
   1446 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1447 ; SSE2-NEXT:    psrlw $5, %xmm6
   1448 ; SSE2-NEXT:    pand %xmm2, %xmm6
   1449 ; SSE2-NEXT:    por %xmm4, %xmm6
   1450 ; SSE2-NEXT:    psrlw $7, %xmm1
   1451 ; SSE2-NEXT:    pand %xmm3, %xmm1
   1452 ; SSE2-NEXT:    por %xmm6, %xmm1
   1453 ; SSE2-NEXT:    por %xmm5, %xmm1
   1454 ; SSE2-NEXT:    retq
   1455 ;
   1456 ; SSSE3-LABEL: test_bitreverse_v32i8:
   1457 ; SSSE3:       # BB#0:
   1458 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1459 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   1460 ; SSSE3-NEXT:    pand %xmm4, %xmm2
   1461 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1462 ; SSSE3-NEXT:    movdqa %xmm5, %xmm6
   1463 ; SSSE3-NEXT:    pshufb %xmm2, %xmm6
   1464 ; SSSE3-NEXT:    psrlw $4, %xmm0
   1465 ; SSSE3-NEXT:    pand %xmm4, %xmm0
   1466 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1467 ; SSSE3-NEXT:    movdqa %xmm2, %xmm3
   1468 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
   1469 ; SSSE3-NEXT:    por %xmm6, %xmm3
   1470 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1471 ; SSSE3-NEXT:    pand %xmm4, %xmm0
   1472 ; SSSE3-NEXT:    pshufb %xmm0, %xmm5
   1473 ; SSSE3-NEXT:    psrlw $4, %xmm1
   1474 ; SSSE3-NEXT:    pand %xmm4, %xmm1
   1475 ; SSSE3-NEXT:    pshufb %xmm1, %xmm2
   1476 ; SSSE3-NEXT:    por %xmm5, %xmm2
   1477 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
   1478 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
   1479 ; SSSE3-NEXT:    retq
   1480 ;
   1481 ; AVX1-LABEL: test_bitreverse_v32i8:
   1482 ; AVX1:       # BB#0:
   1483 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1484 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1485 ; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
   1486 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1487 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
   1488 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   1489 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
   1490 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1491 ; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
   1492 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
   1493 ; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
   1494 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
   1495 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1496 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   1497 ; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
   1498 ; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
   1499 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1500 ; AVX1-NEXT:    retq
   1501 ;
   1502 ; AVX2-LABEL: test_bitreverse_v32i8:
   1503 ; AVX2:       # BB#0:
   1504 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1505 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1506 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1507 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1508 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1509 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1510 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1511 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1512 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1513 ; AVX2-NEXT:    retq
   1514 ;
   1515 ; AVX512-LABEL: test_bitreverse_v32i8:
   1516 ; AVX512:       # BB#0:
   1517 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1518 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1519 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1520 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1521 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1522 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1523 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1524 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1525 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1526 ; AVX512-NEXT:    retq
   1527 ;
   1528 ; XOPAVX1-LABEL: test_bitreverse_v32i8:
   1529 ; XOPAVX1:       # BB#0:
   1530 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1531 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
   1532 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1533 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1534 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1535 ; XOPAVX1-NEXT:    retq
   1536 ;
   1537 ; XOPAVX2-LABEL: test_bitreverse_v32i8:
   1538 ; XOPAVX2:       # BB#0:
   1539 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1540 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
   1541 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1542 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1543 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1544 ; XOPAVX2-NEXT:    retq
   1545   %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
   1546   ret <32 x i8> %b
   1547 }
   1548 
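; Only the byte-swap prefix changes with the element width: the v16i16 test
; below swaps byte pairs ([1,0,3,2,...] shuffles), while the i32 and i64 tests
; use [3,2,1,0,...] and [7,6,5,4,...] orderings. The XOP runs fold that swap
; into the vpperm selector itself ([81,80,83,82,...] here versus [80,81,82,...]
; in the v32i8 test above); the per-byte bit reversal is otherwise identical.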
   1549 define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
   1550 ; SSE2-LABEL: test_bitreverse_v16i16:
   1551 ; SSE2:       # BB#0:
   1552 ; SSE2-NEXT:    pxor %xmm9, %xmm9
   1553 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1554 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
   1555 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
   1556 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
   1557 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
   1558 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
   1559 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
   1560 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
   1561 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1562 ; SSE2-NEXT:    psllw $5, %xmm2
   1563 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
   1564 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
   1565 ; SSE2-NEXT:    pand %xmm10, %xmm2
   1566 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1567 ; SSE2-NEXT:    psllw $7, %xmm3
   1568 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   1569 ; SSE2-NEXT:    pand %xmm11, %xmm11
   1570 ; SSE2-NEXT:    pand %xmm11, %xmm3
   1571 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1572 ; SSE2-NEXT:    psllw $3, %xmm4
   1573 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
   1574 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
   1575 ; SSE2-NEXT:    pand %xmm12, %xmm4
   1576 ; SSE2-NEXT:    por %xmm2, %xmm4
   1577 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1578 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1579 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
   1580 ; SSE2-NEXT:    pand %xmm8, %xmm2
   1581 ; SSE2-NEXT:    por %xmm4, %xmm2
   1582 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1583 ; SSE2-NEXT:    psrlw $1, %xmm4
   1584 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   1585 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
   1586 ; SSE2-NEXT:    pand %xmm13, %xmm4
   1587 ; SSE2-NEXT:    por %xmm2, %xmm4
   1588 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   1589 ; SSE2-NEXT:    psrlw $3, %xmm5
   1590 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
   1591 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm6
   1592 ; SSE2-NEXT:    pand %xmm6, %xmm5
   1593 ; SSE2-NEXT:    por %xmm4, %xmm5
   1594 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   1595 ; SSE2-NEXT:    psrlw $5, %xmm7
   1596 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
   1597 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
   1598 ; SSE2-NEXT:    pand %xmm2, %xmm7
   1599 ; SSE2-NEXT:    por %xmm5, %xmm7
   1600 ; SSE2-NEXT:    psrlw $7, %xmm0
   1601 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   1602 ; SSE2-NEXT:    pand %xmm4, %xmm4
   1603 ; SSE2-NEXT:    pand %xmm4, %xmm0
   1604 ; SSE2-NEXT:    por %xmm7, %xmm0
   1605 ; SSE2-NEXT:    por %xmm3, %xmm0
   1606 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1607 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
   1608 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
   1609 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
   1610 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
   1611 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
   1612 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
   1613 ; SSE2-NEXT:    packuswb %xmm3, %xmm1
   1614 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1615 ; SSE2-NEXT:    psllw $5, %xmm5
   1616 ; SSE2-NEXT:    pand %xmm10, %xmm5
   1617 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1618 ; SSE2-NEXT:    psllw $7, %xmm3
   1619 ; SSE2-NEXT:    pand %xmm11, %xmm3
   1620 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   1621 ; SSE2-NEXT:    psllw $3, %xmm7
   1622 ; SSE2-NEXT:    pand %xmm12, %xmm7
   1623 ; SSE2-NEXT:    por %xmm5, %xmm7
   1624 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1625 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   1626 ; SSE2-NEXT:    pand %xmm8, %xmm5
   1627 ; SSE2-NEXT:    por %xmm7, %xmm5
   1628 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   1629 ; SSE2-NEXT:    psrlw $1, %xmm7
   1630 ; SSE2-NEXT:    pand %xmm13, %xmm7
   1631 ; SSE2-NEXT:    por %xmm5, %xmm7
   1632 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1633 ; SSE2-NEXT:    psrlw $3, %xmm5
   1634 ; SSE2-NEXT:    pand %xmm6, %xmm5
   1635 ; SSE2-NEXT:    por %xmm7, %xmm5
   1636 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1637 ; SSE2-NEXT:    psrlw $5, %xmm6
   1638 ; SSE2-NEXT:    pand %xmm2, %xmm6
   1639 ; SSE2-NEXT:    por %xmm5, %xmm6
   1640 ; SSE2-NEXT:    psrlw $7, %xmm1
   1641 ; SSE2-NEXT:    pand %xmm4, %xmm1
   1642 ; SSE2-NEXT:    por %xmm6, %xmm1
   1643 ; SSE2-NEXT:    por %xmm3, %xmm1
   1644 ; SSE2-NEXT:    retq
   1645 ;
   1646 ; SSSE3-LABEL: test_bitreverse_v16i16:
   1647 ; SSSE3:       # BB#0:
   1648 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   1649 ; SSSE3-NEXT:    pshufb %xmm4, %xmm0
   1650 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1651 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   1652 ; SSSE3-NEXT:    pand %xmm5, %xmm2
   1653 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1654 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
   1655 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   1656 ; SSSE3-NEXT:    psrlw $4, %xmm0
   1657 ; SSSE3-NEXT:    pand %xmm5, %xmm0
   1658 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1659 ; SSSE3-NEXT:    movdqa %xmm2, %xmm3
   1660 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
   1661 ; SSSE3-NEXT:    por %xmm7, %xmm3
   1662 ; SSSE3-NEXT:    pshufb %xmm4, %xmm1
   1663 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1664 ; SSSE3-NEXT:    pand %xmm5, %xmm0
   1665 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   1666 ; SSSE3-NEXT:    psrlw $4, %xmm1
   1667 ; SSSE3-NEXT:    pand %xmm5, %xmm1
   1668 ; SSSE3-NEXT:    pshufb %xmm1, %xmm2
   1669 ; SSSE3-NEXT:    por %xmm6, %xmm2
   1670 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
   1671 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
   1672 ; SSSE3-NEXT:    retq
   1673 ;
   1674 ; AVX1-LABEL: test_bitreverse_v16i16:
   1675 ; AVX1:       # BB#0:
   1676 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1677 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   1678 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1679 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1680 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
   1681 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1682 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   1683 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   1684 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   1685 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1686 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
   1687 ; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
   1688 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1689 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
   1690 ; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
   1691 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1692 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   1693 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
   1694 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1695 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1696 ; AVX1-NEXT:    retq
   1697 ;
   1698 ; AVX2-LABEL: test_bitreverse_v16i16:
   1699 ; AVX2:       # BB#0:
   1700 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
   1701 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1702 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1703 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1704 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1705 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1706 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1707 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1708 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1709 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1710 ; AVX2-NEXT:    retq
   1711 ;
   1712 ; AVX512-LABEL: test_bitreverse_v16i16:
   1713 ; AVX512:       # BB#0:
   1714 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
   1715 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1716 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1717 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1718 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1719 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1720 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1721 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1722 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1723 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1724 ; AVX512-NEXT:    retq
   1725 ;
   1726 ; XOPAVX1-LABEL: test_bitreverse_v16i16:
   1727 ; XOPAVX1:       # BB#0:
   1728 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1729 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
   1730 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1731 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1732 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1733 ; XOPAVX1-NEXT:    retq
   1734 ;
   1735 ; XOPAVX2-LABEL: test_bitreverse_v16i16:
   1736 ; XOPAVX2:       # BB#0:
   1737 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1738 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
   1739 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1740 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1741 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1742 ; XOPAVX2-NEXT:    retq
   1743   %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
   1744   ret <16 x i16> %b
   1745 }
   1746 
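; In these 256-bit SSE2 expansions the eight bit masks are needed for both
; input registers, so they are materialized once into spare xmm registers
; (xmm8 and up) and reused for the second half, rather than being re-read from
; the constant pool as in the 128-bit tests above.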
   1747 define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
   1748 ; SSE2-LABEL: test_bitreverse_v8i32:
   1749 ; SSE2:       # BB#0:
   1750 ; SSE2-NEXT:    pxor %xmm9, %xmm9
   1751 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1752 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
   1753 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
   1754 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
   1755 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
   1756 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   1757 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
   1758 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
   1759 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1760 ; SSE2-NEXT:    psllw $5, %xmm2
   1761 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
   1762 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
   1763 ; SSE2-NEXT:    pand %xmm10, %xmm2
   1764 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1765 ; SSE2-NEXT:    psllw $7, %xmm3
   1766 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   1767 ; SSE2-NEXT:    pand %xmm11, %xmm11
   1768 ; SSE2-NEXT:    pand %xmm11, %xmm3
   1769 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1770 ; SSE2-NEXT:    psllw $3, %xmm4
   1771 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
   1772 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
   1773 ; SSE2-NEXT:    pand %xmm12, %xmm4
   1774 ; SSE2-NEXT:    por %xmm2, %xmm4
   1775 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1776 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1777 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
   1778 ; SSE2-NEXT:    pand %xmm8, %xmm2
   1779 ; SSE2-NEXT:    por %xmm4, %xmm2
   1780 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1781 ; SSE2-NEXT:    psrlw $1, %xmm4
   1782 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   1783 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
   1784 ; SSE2-NEXT:    pand %xmm13, %xmm4
   1785 ; SSE2-NEXT:    por %xmm2, %xmm4
   1786 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   1787 ; SSE2-NEXT:    psrlw $3, %xmm5
   1788 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
   1789 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm6
   1790 ; SSE2-NEXT:    pand %xmm6, %xmm5
   1791 ; SSE2-NEXT:    por %xmm4, %xmm5
   1792 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   1793 ; SSE2-NEXT:    psrlw $5, %xmm7
   1794 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
   1795 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
   1796 ; SSE2-NEXT:    pand %xmm2, %xmm7
   1797 ; SSE2-NEXT:    por %xmm5, %xmm7
   1798 ; SSE2-NEXT:    psrlw $7, %xmm0
   1799 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   1800 ; SSE2-NEXT:    pand %xmm4, %xmm4
   1801 ; SSE2-NEXT:    pand %xmm4, %xmm0
   1802 ; SSE2-NEXT:    por %xmm7, %xmm0
   1803 ; SSE2-NEXT:    por %xmm3, %xmm0
   1804 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1805 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
   1806 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
   1807 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
   1808 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
   1809 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
   1810 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
   1811 ; SSE2-NEXT:    packuswb %xmm3, %xmm1
   1812 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1813 ; SSE2-NEXT:    psllw $5, %xmm5
   1814 ; SSE2-NEXT:    pand %xmm10, %xmm5
   1815 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1816 ; SSE2-NEXT:    psllw $7, %xmm3
   1817 ; SSE2-NEXT:    pand %xmm11, %xmm3
   1818 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   1819 ; SSE2-NEXT:    psllw $3, %xmm7
   1820 ; SSE2-NEXT:    pand %xmm12, %xmm7
   1821 ; SSE2-NEXT:    por %xmm5, %xmm7
   1822 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1823 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   1824 ; SSE2-NEXT:    pand %xmm8, %xmm5
   1825 ; SSE2-NEXT:    por %xmm7, %xmm5
   1826 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   1827 ; SSE2-NEXT:    psrlw $1, %xmm7
   1828 ; SSE2-NEXT:    pand %xmm13, %xmm7
   1829 ; SSE2-NEXT:    por %xmm5, %xmm7
   1830 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   1831 ; SSE2-NEXT:    psrlw $3, %xmm5
   1832 ; SSE2-NEXT:    pand %xmm6, %xmm5
   1833 ; SSE2-NEXT:    por %xmm7, %xmm5
   1834 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1835 ; SSE2-NEXT:    psrlw $5, %xmm6
   1836 ; SSE2-NEXT:    pand %xmm2, %xmm6
   1837 ; SSE2-NEXT:    por %xmm5, %xmm6
   1838 ; SSE2-NEXT:    psrlw $7, %xmm1
   1839 ; SSE2-NEXT:    pand %xmm4, %xmm1
   1840 ; SSE2-NEXT:    por %xmm6, %xmm1
   1841 ; SSE2-NEXT:    por %xmm3, %xmm1
   1842 ; SSE2-NEXT:    retq
   1843 ;
   1844 ; SSSE3-LABEL: test_bitreverse_v8i32:
   1845 ; SSSE3:       # BB#0:
   1846 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   1847 ; SSSE3-NEXT:    pshufb %xmm4, %xmm0
   1848 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1849 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   1850 ; SSSE3-NEXT:    pand %xmm5, %xmm2
   1851 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1852 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
   1853 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   1854 ; SSSE3-NEXT:    psrlw $4, %xmm0
   1855 ; SSSE3-NEXT:    pand %xmm5, %xmm0
   1856 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1857 ; SSSE3-NEXT:    movdqa %xmm2, %xmm3
   1858 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
   1859 ; SSSE3-NEXT:    por %xmm7, %xmm3
   1860 ; SSSE3-NEXT:    pshufb %xmm4, %xmm1
   1861 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1862 ; SSSE3-NEXT:    pand %xmm5, %xmm0
   1863 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   1864 ; SSSE3-NEXT:    psrlw $4, %xmm1
   1865 ; SSSE3-NEXT:    pand %xmm5, %xmm1
   1866 ; SSSE3-NEXT:    pshufb %xmm1, %xmm2
   1867 ; SSSE3-NEXT:    por %xmm6, %xmm2
   1868 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
   1869 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
   1870 ; SSSE3-NEXT:    retq
   1871 ;
   1872 ; AVX1-LABEL: test_bitreverse_v8i32:
   1873 ; AVX1:       # BB#0:
   1874 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1875 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   1876 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1877 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1878 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
   1879 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1880 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   1881 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   1882 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   1883 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1884 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
   1885 ; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
   1886 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1887 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
   1888 ; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
   1889 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1890 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   1891 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
   1892 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1893 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1894 ; AVX1-NEXT:    retq
   1895 ;
   1896 ; AVX2-LABEL: test_bitreverse_v8i32:
   1897 ; AVX2:       # BB#0:
   1898 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
   1899 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1900 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1901 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1902 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1903 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1904 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1905 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1906 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1907 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1908 ; AVX2-NEXT:    retq
   1909 ;
   1910 ; AVX512-LABEL: test_bitreverse_v8i32:
   1911 ; AVX512:       # BB#0:
   1912 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
   1913 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1914 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1915 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1916 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1917 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1918 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1919 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1920 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1921 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1922 ; AVX512-NEXT:    retq
   1923 ;
   1924 ; XOPAVX1-LABEL: test_bitreverse_v8i32:
   1925 ; XOPAVX1:       # BB#0:
   1926 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1927 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
   1928 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1929 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1930 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1931 ; XOPAVX1-NEXT:    retq
   1932 ;
   1933 ; XOPAVX2-LABEL: test_bitreverse_v8i32:
   1934 ; XOPAVX2:       # BB#0:
   1935 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1936 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
   1937 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1938 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1939 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1940 ; XOPAVX2-NEXT:    retq
   1941   %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
   1942   ret <8 x i32> %b
   1943 }
   1944 
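; For the i64 tests (v2i64 above, v4i64 below) the SSE2 byte swap needs one
; more step than the narrower element types: after widening the bytes to words
; with punpcklbw/punpckhbw, a pshufd [2,3,0,1] swaps the two dwords and
; pshuflw/pshufhw reverse the words within each half, which together reverse
; the byte order of each 64-bit element.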
   1945 define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
   1946 ; SSE2-LABEL: test_bitreverse_v4i64:
   1947 ; SSE2:       # BB#0:
   1948 ; SSE2-NEXT:    pxor %xmm9, %xmm9
   1949 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1950 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
   1951 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   1952 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
   1953 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
   1954 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
   1955 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1956 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   1957 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
   1958 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
   1959 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1960 ; SSE2-NEXT:    psllw $5, %xmm2
   1961 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
   1962 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
   1963 ; SSE2-NEXT:    pand %xmm10, %xmm2
   1964 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1965 ; SSE2-NEXT:    psllw $7, %xmm4
   1966 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   1967 ; SSE2-NEXT:    pand %xmm11, %xmm11
   1968 ; SSE2-NEXT:    pand %xmm11, %xmm4
   1969 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1970 ; SSE2-NEXT:    psllw $3, %xmm3
   1971 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
   1972 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
   1973 ; SSE2-NEXT:    pand %xmm12, %xmm3
   1974 ; SSE2-NEXT:    por %xmm2, %xmm3
   1975 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1976 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1977 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
   1978 ; SSE2-NEXT:    pand %xmm8, %xmm2
   1979 ; SSE2-NEXT:    por %xmm3, %xmm2
   1980 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1981 ; SSE2-NEXT:    psrlw $1, %xmm3
   1982 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   1983 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
   1984 ; SSE2-NEXT:    pand %xmm13, %xmm3
   1985 ; SSE2-NEXT:    por %xmm2, %xmm3
   1986 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   1987 ; SSE2-NEXT:    psrlw $3, %xmm5
   1988 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
   1989 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm6
   1990 ; SSE2-NEXT:    pand %xmm6, %xmm5
   1991 ; SSE2-NEXT:    por %xmm3, %xmm5
   1992 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   1993 ; SSE2-NEXT:    psrlw $5, %xmm7
   1994 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
   1995 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
   1996 ; SSE2-NEXT:    pand %xmm2, %xmm7
   1997 ; SSE2-NEXT:    por %xmm5, %xmm7
   1998 ; SSE2-NEXT:    psrlw $7, %xmm0
   1999 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   2000 ; SSE2-NEXT:    pand %xmm3, %xmm3
   2001 ; SSE2-NEXT:    pand %xmm3, %xmm0
   2002 ; SSE2-NEXT:    por %xmm7, %xmm0
   2003 ; SSE2-NEXT:    por %xmm4, %xmm0
   2004 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2005 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2006 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
   2007 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   2008 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   2009 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
   2010 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2011 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
   2012 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
   2013 ; SSE2-NEXT:    packuswb %xmm4, %xmm1
   2014 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2015 ; SSE2-NEXT:    psllw $5, %xmm5
   2016 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2017 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2018 ; SSE2-NEXT:    psllw $7, %xmm4
   2019 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2020 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   2021 ; SSE2-NEXT:    psllw $3, %xmm7
   2022 ; SSE2-NEXT:    pand %xmm12, %xmm7
   2023 ; SSE2-NEXT:    por %xmm5, %xmm7
   2024 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2025 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2026 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2027 ; SSE2-NEXT:    por %xmm7, %xmm5
   2028 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   2029 ; SSE2-NEXT:    psrlw $1, %xmm7
   2030 ; SSE2-NEXT:    pand %xmm13, %xmm7
   2031 ; SSE2-NEXT:    por %xmm5, %xmm7
   2032 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2033 ; SSE2-NEXT:    psrlw $3, %xmm5
   2034 ; SSE2-NEXT:    pand %xmm6, %xmm5
   2035 ; SSE2-NEXT:    por %xmm7, %xmm5
   2036 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2037 ; SSE2-NEXT:    psrlw $5, %xmm6
   2038 ; SSE2-NEXT:    pand %xmm2, %xmm6
   2039 ; SSE2-NEXT:    por %xmm5, %xmm6
   2040 ; SSE2-NEXT:    psrlw $7, %xmm1
   2041 ; SSE2-NEXT:    pand %xmm3, %xmm1
   2042 ; SSE2-NEXT:    por %xmm6, %xmm1
   2043 ; SSE2-NEXT:    por %xmm4, %xmm1
   2044 ; SSE2-NEXT:    retq
   2045 ;
   2046 ; SSSE3-LABEL: test_bitreverse_v4i64:
   2047 ; SSSE3:       # BB#0:
   2048 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   2049 ; SSSE3-NEXT:    pshufb %xmm4, %xmm0
   2050 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2051 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   2052 ; SSSE3-NEXT:    pand %xmm5, %xmm2
   2053 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2054 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
   2055 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   2056 ; SSSE3-NEXT:    psrlw $4, %xmm0
   2057 ; SSSE3-NEXT:    pand %xmm5, %xmm0
   2058 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2059 ; SSSE3-NEXT:    movdqa %xmm2, %xmm3
   2060 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
   2061 ; SSSE3-NEXT:    por %xmm7, %xmm3
   2062 ; SSSE3-NEXT:    pshufb %xmm4, %xmm1
   2063 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   2064 ; SSSE3-NEXT:    pand %xmm5, %xmm0
   2065 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   2066 ; SSSE3-NEXT:    psrlw $4, %xmm1
   2067 ; SSSE3-NEXT:    pand %xmm5, %xmm1
   2068 ; SSSE3-NEXT:    pshufb %xmm1, %xmm2
   2069 ; SSSE3-NEXT:    por %xmm6, %xmm2
   2070 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
   2071 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
   2072 ; SSSE3-NEXT:    retq
   2073 ;
   2074 ; AVX1-LABEL: test_bitreverse_v4i64:
   2075 ; AVX1:       # BB#0:
   2076 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2077 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   2078 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   2079 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2080 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
   2081 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2082 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   2083 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   2084 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   2085 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2086 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
   2087 ; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
   2088 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   2089 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
   2090 ; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
   2091 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   2092 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   2093 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
   2094 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
   2095 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2096 ; AVX1-NEXT:    retq
   2097 ;
   2098 ; AVX2-LABEL: test_bitreverse_v4i64:
   2099 ; AVX2:       # BB#0:
   2100 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
   2101 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2102 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
   2103 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2104 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   2105 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   2106 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   2107 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2108 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   2109 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
   2110 ; AVX2-NEXT:    retq
   2111 ;
   2112 ; AVX512-LABEL: test_bitreverse_v4i64:
   2113 ; AVX512:       # BB#0:
   2114 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
   2115 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2116 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
   2117 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2118 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   2119 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
   2120 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
   2121 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2122 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   2123 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
   2124 ; AVX512-NEXT:    retq
   2125 ;
   2126 ; XOPAVX1-LABEL: test_bitreverse_v4i64:
   2127 ; XOPAVX1:       # BB#0:
   2128 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   2129 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
   2130 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   2131 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   2132 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2133 ; XOPAVX1-NEXT:    retq
   2134 ;
   2135 ; XOPAVX2-LABEL: test_bitreverse_v4i64:
   2136 ; XOPAVX2:       # BB#0:
   2137 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2138 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
   2139 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   2140 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   2141 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2142 ; XOPAVX2-NEXT:    retq
   2143   %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
   2144   ret <4 x i64> %b
   2145 }
   2146 
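; The 512-bit test below shows the same split one level up: only the AVX512BW
; run can do the nibble lookup on full zmm registers (vpandq, vpshufb, vporq,
; with the tables repeated four times), since 512-bit byte shuffles require
; AVX512BW. The AVX512F run falls back to two ymm halves like AVX2, and the
; SSE and XOP runs work through four xmm registers.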
   2147 define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
   2148 ; SSE2-LABEL: test_bitreverse_v64i8:
   2149 ; SSE2:       # BB#0:
   2150 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   2151 ; SSE2-NEXT:    psllw $5, %xmm4
   2152 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
   2153 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm9
   2154 ; SSE2-NEXT:    pand %xmm9, %xmm4
   2155 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   2156 ; SSE2-NEXT:    psllw $7, %xmm7
   2157 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   2158 ; SSE2-NEXT:    pand %xmm10, %xmm10
   2159 ; SSE2-NEXT:    pand %xmm10, %xmm7
   2160 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2161 ; SSE2-NEXT:    psllw $3, %xmm5
   2162 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
   2163 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm11
   2164 ; SSE2-NEXT:    pand %xmm11, %xmm5
   2165 ; SSE2-NEXT:    por %xmm4, %xmm5
   2166 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   2167 ; SSE2-NEXT:    paddb %xmm4, %xmm4
   2168 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
   2169 ; SSE2-NEXT:    pand %xmm8, %xmm4
   2170 ; SSE2-NEXT:    por %xmm5, %xmm4
   2171 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2172 ; SSE2-NEXT:    psrlw $1, %xmm5
   2173 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   2174 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
   2175 ; SSE2-NEXT:    pand %xmm12, %xmm5
   2176 ; SSE2-NEXT:    por %xmm4, %xmm5
   2177 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   2178 ; SSE2-NEXT:    psrlw $3, %xmm6
   2179 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
   2180 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
   2181 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2182 ; SSE2-NEXT:    por %xmm5, %xmm6
   2183 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   2184 ; SSE2-NEXT:    psrlw $5, %xmm4
   2185 ; SSE2-NEXT:    movdqa {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
   2186 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm14
   2187 ; SSE2-NEXT:    pand %xmm14, %xmm4
   2188 ; SSE2-NEXT:    por %xmm6, %xmm4
   2189 ; SSE2-NEXT:    psrlw $7, %xmm0
   2190 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   2191 ; SSE2-NEXT:    pand %xmm6, %xmm6
   2192 ; SSE2-NEXT:    pand %xmm6, %xmm0
   2193 ; SSE2-NEXT:    por %xmm4, %xmm0
   2194 ; SSE2-NEXT:    por %xmm7, %xmm0
   2195 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2196 ; SSE2-NEXT:    psllw $5, %xmm4
   2197 ; SSE2-NEXT:    pand %xmm9, %xmm4
   2198 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   2199 ; SSE2-NEXT:    psllw $7, %xmm7
   2200 ; SSE2-NEXT:    pand %xmm10, %xmm7
   2201 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2202 ; SSE2-NEXT:    psllw $3, %xmm5
   2203 ; SSE2-NEXT:    pand %xmm11, %xmm5
   2204 ; SSE2-NEXT:    por %xmm4, %xmm5
   2205 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2206 ; SSE2-NEXT:    paddb %xmm4, %xmm4
   2207 ; SSE2-NEXT:    pand %xmm8, %xmm4
   2208 ; SSE2-NEXT:    por %xmm5, %xmm4
   2209 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2210 ; SSE2-NEXT:    psrlw $1, %xmm5
   2211 ; SSE2-NEXT:    pand %xmm12, %xmm5
   2212 ; SSE2-NEXT:    por %xmm4, %xmm5
   2213 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2214 ; SSE2-NEXT:    psrlw $3, %xmm4
   2215 ; SSE2-NEXT:    pand %xmm13, %xmm4
   2216 ; SSE2-NEXT:    por %xmm5, %xmm4
   2217 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2218 ; SSE2-NEXT:    psrlw $5, %xmm5
   2219 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2220 ; SSE2-NEXT:    por %xmm4, %xmm5
   2221 ; SSE2-NEXT:    psrlw $7, %xmm1
   2222 ; SSE2-NEXT:    pand %xmm6, %xmm1
   2223 ; SSE2-NEXT:    por %xmm5, %xmm1
   2224 ; SSE2-NEXT:    por %xmm7, %xmm1
   2225 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2226 ; SSE2-NEXT:    psllw $5, %xmm4
   2227 ; SSE2-NEXT:    pand %xmm9, %xmm4
   2228 ; SSE2-NEXT:    movdqa %xmm2, %xmm7
   2229 ; SSE2-NEXT:    psllw $7, %xmm7
   2230 ; SSE2-NEXT:    pand %xmm10, %xmm7
   2231 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2232 ; SSE2-NEXT:    psllw $3, %xmm5
   2233 ; SSE2-NEXT:    pand %xmm11, %xmm5
   2234 ; SSE2-NEXT:    por %xmm4, %xmm5
   2235 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2236 ; SSE2-NEXT:    paddb %xmm4, %xmm4
   2237 ; SSE2-NEXT:    pand %xmm8, %xmm4
   2238 ; SSE2-NEXT:    por %xmm5, %xmm4
   2239 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2240 ; SSE2-NEXT:    psrlw $1, %xmm5
   2241 ; SSE2-NEXT:    pand %xmm12, %xmm5
   2242 ; SSE2-NEXT:    por %xmm4, %xmm5
   2243 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2244 ; SSE2-NEXT:    psrlw $3, %xmm4
   2245 ; SSE2-NEXT:    pand %xmm13, %xmm4
   2246 ; SSE2-NEXT:    por %xmm5, %xmm4
   2247 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2248 ; SSE2-NEXT:    psrlw $5, %xmm5
   2249 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2250 ; SSE2-NEXT:    por %xmm4, %xmm5
   2251 ; SSE2-NEXT:    psrlw $7, %xmm2
   2252 ; SSE2-NEXT:    pand %xmm6, %xmm2
   2253 ; SSE2-NEXT:    por %xmm5, %xmm2
   2254 ; SSE2-NEXT:    por %xmm7, %xmm2
   2255 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   2256 ; SSE2-NEXT:    psllw $5, %xmm4
   2257 ; SSE2-NEXT:    pand %xmm9, %xmm4
   2258 ; SSE2-NEXT:    movdqa %xmm3, %xmm7
   2259 ; SSE2-NEXT:    psllw $7, %xmm7
   2260 ; SSE2-NEXT:    pand %xmm10, %xmm7
   2261 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2262 ; SSE2-NEXT:    psllw $3, %xmm5
   2263 ; SSE2-NEXT:    pand %xmm11, %xmm5
   2264 ; SSE2-NEXT:    por %xmm4, %xmm5
   2265 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   2266 ; SSE2-NEXT:    paddb %xmm4, %xmm4
   2267 ; SSE2-NEXT:    pand %xmm8, %xmm4
   2268 ; SSE2-NEXT:    por %xmm5, %xmm4
   2269 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2270 ; SSE2-NEXT:    psrlw $1, %xmm5
   2271 ; SSE2-NEXT:    pand %xmm12, %xmm5
   2272 ; SSE2-NEXT:    por %xmm4, %xmm5
   2273 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   2274 ; SSE2-NEXT:    psrlw $3, %xmm4
   2275 ; SSE2-NEXT:    pand %xmm13, %xmm4
   2276 ; SSE2-NEXT:    por %xmm5, %xmm4
   2277 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2278 ; SSE2-NEXT:    psrlw $5, %xmm5
   2279 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2280 ; SSE2-NEXT:    por %xmm4, %xmm5
   2281 ; SSE2-NEXT:    psrlw $7, %xmm3
   2282 ; SSE2-NEXT:    pand %xmm6, %xmm3
   2283 ; SSE2-NEXT:    por %xmm5, %xmm3
   2284 ; SSE2-NEXT:    por %xmm7, %xmm3
   2285 ; SSE2-NEXT:    retq
   2286 ;
   2287 ; SSSE3-LABEL: test_bitreverse_v64i8:
   2288 ; SSSE3:       # BB#0:
   2289 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
   2290 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2291 ; SSSE3-NEXT:    pand %xmm8, %xmm0
   2292 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2293 ; SSSE3-NEXT:    movdqa %xmm9, %xmm6
   2294 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   2295 ; SSSE3-NEXT:    psrlw $4, %xmm5
   2296 ; SSSE3-NEXT:    pand %xmm8, %xmm5
   2297 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2298 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
   2299 ; SSSE3-NEXT:    pshufb %xmm5, %xmm0
   2300 ; SSSE3-NEXT:    por %xmm6, %xmm0
   2301 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
   2302 ; SSSE3-NEXT:    pand %xmm8, %xmm5
   2303 ; SSSE3-NEXT:    movdqa %xmm9, %xmm6
   2304 ; SSSE3-NEXT:    pshufb %xmm5, %xmm6
   2305 ; SSSE3-NEXT:    psrlw $4, %xmm1
   2306 ; SSSE3-NEXT:    pand %xmm8, %xmm1
   2307 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
   2308 ; SSSE3-NEXT:    pshufb %xmm1, %xmm5
   2309 ; SSSE3-NEXT:    por %xmm6, %xmm5
   2310 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
   2311 ; SSSE3-NEXT:    pand %xmm8, %xmm1
   2312 ; SSSE3-NEXT:    movdqa %xmm9, %xmm7
   2313 ; SSSE3-NEXT:    pshufb %xmm1, %xmm7
   2314 ; SSSE3-NEXT:    psrlw $4, %xmm2
   2315 ; SSSE3-NEXT:    pand %xmm8, %xmm2
   2316 ; SSSE3-NEXT:    movdqa %xmm4, %xmm6
   2317 ; SSSE3-NEXT:    pshufb %xmm2, %xmm6
   2318 ; SSSE3-NEXT:    por %xmm7, %xmm6
   2319 ; SSSE3-NEXT:    movdqa %xmm3, %xmm1
   2320 ; SSSE3-NEXT:    pand %xmm8, %xmm1
   2321 ; SSSE3-NEXT:    pshufb %xmm1, %xmm9
   2322 ; SSSE3-NEXT:    psrlw $4, %xmm3
   2323 ; SSSE3-NEXT:    pand %xmm8, %xmm3
   2324 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
   2325 ; SSSE3-NEXT:    por %xmm9, %xmm4
   2326 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
   2327 ; SSSE3-NEXT:    movdqa %xmm6, %xmm2
   2328 ; SSSE3-NEXT:    movdqa %xmm4, %xmm3
   2329 ; SSSE3-NEXT:    retq
   2330 ;
   2331 ; AVX1-LABEL: test_bitreverse_v64i8:
   2332 ; AVX1:       # BB#0:
   2333 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2334 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2335 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm4
   2336 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2337 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   2338 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   2339 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   2340 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2341 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
   2342 ; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
   2343 ; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm4
   2344 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   2345 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   2346 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   2347 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
   2348 ; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
   2349 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2350 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2351 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm4
   2352 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   2353 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   2354 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   2355 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
   2356 ; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
   2357 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm4
   2358 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   2359 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   2360 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   2361 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
   2362 ; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
   2363 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   2364 ; AVX1-NEXT:    retq
   2365 ;
   2366 ; AVX2-LABEL: test_bitreverse_v64i8:
   2367 ; AVX2:       # BB#0:
   2368 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2369 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
   2370 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2371 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
   2372 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   2373 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
   2374 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2375 ; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
   2376 ; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
   2377 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
   2378 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
   2379 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
   2380 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
   2381 ; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
   2382 ; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
   2383 ; AVX2-NEXT:    retq
   2384 ;
   2385 ; AVX512F-LABEL: test_bitreverse_v64i8:
   2386 ; AVX512F:       # BB#0:
   2387 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2388 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
   2389 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2390 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
   2391 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
   2392 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
   2393 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2394 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
   2395 ; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
   2396 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
   2397 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
   2398 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
   2399 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
   2400 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
   2401 ; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
   2402 ; AVX512F-NEXT:    retq
   2403 ;
   2404 ; AVX512BW-LABEL: test_bitreverse_v64i8:
   2405 ; AVX512BW:       # BB#0:
   2406 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2407 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
   2408 ; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2409 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
   2410 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
   2411 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   2412 ; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2413 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
   2414 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
   2415 ; AVX512BW-NEXT:    retq
   2416 ;
   2417 ; XOPAVX1-LABEL: test_bitreverse_v64i8:
   2418 ; XOPAVX1:       # BB#0:
   2419 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2420 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
   2421 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2422 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   2423 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2424 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2425 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2426 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   2427 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   2428 ; XOPAVX1-NEXT:    retq
   2429 ;
   2430 ; XOPAVX2-LABEL: test_bitreverse_v64i8:
   2431 ; XOPAVX2:       # BB#0:
   2432 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2433 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
   2434 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2435 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   2436 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   2437 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2438 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2439 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   2440 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
   2441 ; XOPAVX2-NEXT:    retq
   2442   %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
   2443   ret <64 x i8> %b
   2444 }
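; The v64i8 lowerings above fall into three strategies. SSSE3/AVX/AVX512BW reverse
; each byte with two 16-entry PSHUFB nibble lookups (the low nibble is reversed into
; the high half, the high nibble into the low half, then the halves are OR'd); plain
; SSE2 assembles the result bit by bit with shifts and masks; XOP uses a single VPPERM
; whose control bytes select a bit-reversed copy of each source byte. A rough scalar
; sketch of the nibble-lookup idea in plain C (illustrative only; the names bitrev8 and
; rev_nib are made up here, but the table is the same 0,8,4,12,... permutation used by
; the shuffles above):
;
;   static const unsigned char rev_nib[16] = {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,
;                                             0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF};
;   unsigned char bitrev8(unsigned char b) {
;     /* low nibble reversed and moved high, high nibble reversed and left low */
;     return (unsigned char)((rev_nib[b & 0x0F] << 4) | rev_nib[b >> 4]);
;   }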
   2445 
   2446 define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
   2447 ; SSE2-LABEL: test_bitreverse_v32i16:
   2448 ; SSE2:       # BB#0:
   2449 ; SSE2-NEXT:    pxor %xmm9, %xmm9
   2450 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   2451 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2452 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
   2453 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
   2454 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
   2455 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
   2456 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
   2457 ; SSE2-NEXT:    packuswb %xmm4, %xmm0
   2458 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2459 ; SSE2-NEXT:    psllw $5, %xmm5
   2460 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
   2461 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
   2462 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2463 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   2464 ; SSE2-NEXT:    psllw $7, %xmm4
   2465 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   2466 ; SSE2-NEXT:    pand %xmm11, %xmm11
   2467 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2468 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   2469 ; SSE2-NEXT:    psllw $3, %xmm6
   2470 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
   2471 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
   2472 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2473 ; SSE2-NEXT:    por %xmm5, %xmm6
   2474 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2475 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2476 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
   2477 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2478 ; SSE2-NEXT:    por %xmm6, %xmm5
   2479 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   2480 ; SSE2-NEXT:    psrlw $1, %xmm6
   2481 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   2482 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
   2483 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2484 ; SSE2-NEXT:    por %xmm5, %xmm6
   2485 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   2486 ; SSE2-NEXT:    psrlw $3, %xmm7
   2487 ; SSE2-NEXT:    movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
   2488 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm14
   2489 ; SSE2-NEXT:    pand %xmm14, %xmm7
   2490 ; SSE2-NEXT:    por %xmm6, %xmm7
   2491 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2492 ; SSE2-NEXT:    psrlw $5, %xmm5
   2493 ; SSE2-NEXT:    movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
   2494 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm15
   2495 ; SSE2-NEXT:    pand %xmm15, %xmm5
   2496 ; SSE2-NEXT:    por %xmm7, %xmm5
   2497 ; SSE2-NEXT:    psrlw $7, %xmm0
   2498 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   2499 ; SSE2-NEXT:    pand %xmm7, %xmm7
   2500 ; SSE2-NEXT:    pand %xmm7, %xmm0
   2501 ; SSE2-NEXT:    por %xmm5, %xmm0
   2502 ; SSE2-NEXT:    por %xmm4, %xmm0
   2503 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2504 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2505 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
   2506 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
   2507 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
   2508 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
   2509 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
   2510 ; SSE2-NEXT:    packuswb %xmm4, %xmm1
   2511 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2512 ; SSE2-NEXT:    psllw $5, %xmm5
   2513 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2514 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2515 ; SSE2-NEXT:    psllw $7, %xmm4
   2516 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2517 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2518 ; SSE2-NEXT:    psllw $3, %xmm6
   2519 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2520 ; SSE2-NEXT:    por %xmm5, %xmm6
   2521 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2522 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2523 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2524 ; SSE2-NEXT:    por %xmm6, %xmm5
   2525 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2526 ; SSE2-NEXT:    psrlw $1, %xmm6
   2527 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2528 ; SSE2-NEXT:    por %xmm5, %xmm6
   2529 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2530 ; SSE2-NEXT:    psrlw $3, %xmm5
   2531 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2532 ; SSE2-NEXT:    por %xmm6, %xmm5
   2533 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2534 ; SSE2-NEXT:    psrlw $5, %xmm6
   2535 ; SSE2-NEXT:    pand %xmm15, %xmm6
   2536 ; SSE2-NEXT:    por %xmm5, %xmm6
   2537 ; SSE2-NEXT:    psrlw $7, %xmm1
   2538 ; SSE2-NEXT:    pand %xmm7, %xmm1
   2539 ; SSE2-NEXT:    por %xmm6, %xmm1
   2540 ; SSE2-NEXT:    por %xmm4, %xmm1
   2541 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2542 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2543 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
   2544 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
   2545 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
   2546 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
   2547 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
   2548 ; SSE2-NEXT:    packuswb %xmm4, %xmm2
   2549 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2550 ; SSE2-NEXT:    psllw $5, %xmm5
   2551 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2552 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2553 ; SSE2-NEXT:    psllw $7, %xmm4
   2554 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2555 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2556 ; SSE2-NEXT:    psllw $3, %xmm6
   2557 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2558 ; SSE2-NEXT:    por %xmm5, %xmm6
   2559 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2560 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2561 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2562 ; SSE2-NEXT:    por %xmm6, %xmm5
   2563 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2564 ; SSE2-NEXT:    psrlw $1, %xmm6
   2565 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2566 ; SSE2-NEXT:    por %xmm5, %xmm6
   2567 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2568 ; SSE2-NEXT:    psrlw $3, %xmm5
   2569 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2570 ; SSE2-NEXT:    por %xmm6, %xmm5
   2571 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2572 ; SSE2-NEXT:    psrlw $5, %xmm6
   2573 ; SSE2-NEXT:    pand %xmm15, %xmm6
   2574 ; SSE2-NEXT:    por %xmm5, %xmm6
   2575 ; SSE2-NEXT:    psrlw $7, %xmm2
   2576 ; SSE2-NEXT:    pand %xmm7, %xmm2
   2577 ; SSE2-NEXT:    por %xmm6, %xmm2
   2578 ; SSE2-NEXT:    por %xmm4, %xmm2
   2579 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   2580 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2581 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
   2582 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
   2583 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
   2584 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
   2585 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
   2586 ; SSE2-NEXT:    packuswb %xmm4, %xmm3
   2587 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2588 ; SSE2-NEXT:    psllw $5, %xmm5
   2589 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2590 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   2591 ; SSE2-NEXT:    psllw $7, %xmm4
   2592 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2593 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   2594 ; SSE2-NEXT:    psllw $3, %xmm6
   2595 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2596 ; SSE2-NEXT:    por %xmm5, %xmm6
   2597 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2598 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2599 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2600 ; SSE2-NEXT:    por %xmm6, %xmm5
   2601 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   2602 ; SSE2-NEXT:    psrlw $1, %xmm6
   2603 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2604 ; SSE2-NEXT:    por %xmm5, %xmm6
   2605 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2606 ; SSE2-NEXT:    psrlw $3, %xmm5
   2607 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2608 ; SSE2-NEXT:    por %xmm6, %xmm5
   2609 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   2610 ; SSE2-NEXT:    psrlw $5, %xmm6
   2611 ; SSE2-NEXT:    pand %xmm15, %xmm6
   2612 ; SSE2-NEXT:    por %xmm5, %xmm6
   2613 ; SSE2-NEXT:    psrlw $7, %xmm3
   2614 ; SSE2-NEXT:    pand %xmm7, %xmm3
   2615 ; SSE2-NEXT:    por %xmm6, %xmm3
   2616 ; SSE2-NEXT:    por %xmm4, %xmm3
   2617 ; SSE2-NEXT:    retq
   2618 ;
   2619 ; SSSE3-LABEL: test_bitreverse_v32i16:
   2620 ; SSSE3:       # BB#0:
   2621 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
   2622 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
   2623 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   2624 ; SSSE3-NEXT:    pshufb %xmm8, %xmm1
   2625 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2626 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   2627 ; SSSE3-NEXT:    pand %xmm9, %xmm0
   2628 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2629 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   2630 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   2631 ; SSSE3-NEXT:    psrlw $4, %xmm1
   2632 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   2633 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2634 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
   2635 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
   2636 ; SSSE3-NEXT:    por %xmm6, %xmm0
   2637 ; SSSE3-NEXT:    pshufb %xmm8, %xmm5
   2638 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
   2639 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   2640 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   2641 ; SSSE3-NEXT:    pshufb %xmm1, %xmm6
   2642 ; SSSE3-NEXT:    psrlw $4, %xmm5
   2643 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   2644 ; SSSE3-NEXT:    movdqa %xmm4, %xmm1
   2645 ; SSSE3-NEXT:    pshufb %xmm5, %xmm1
   2646 ; SSSE3-NEXT:    por %xmm6, %xmm1
   2647 ; SSSE3-NEXT:    pshufb %xmm8, %xmm2
   2648 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
   2649 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   2650 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   2651 ; SSSE3-NEXT:    pshufb %xmm5, %xmm6
   2652 ; SSSE3-NEXT:    psrlw $4, %xmm2
   2653 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   2654 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
   2655 ; SSSE3-NEXT:    pshufb %xmm2, %xmm5
   2656 ; SSSE3-NEXT:    por %xmm6, %xmm5
   2657 ; SSSE3-NEXT:    pshufb %xmm8, %xmm3
   2658 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
   2659 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   2660 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   2661 ; SSSE3-NEXT:    psrlw $4, %xmm3
   2662 ; SSSE3-NEXT:    pand %xmm9, %xmm3
   2663 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
   2664 ; SSSE3-NEXT:    por %xmm7, %xmm4
   2665 ; SSSE3-NEXT:    movdqa %xmm5, %xmm2
   2666 ; SSSE3-NEXT:    movdqa %xmm4, %xmm3
   2667 ; SSSE3-NEXT:    retq
   2668 ;
   2669 ; AVX1-LABEL: test_bitreverse_v32i16:
   2670 ; AVX1:       # BB#0:
   2671 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2672 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   2673 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   2674 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2675 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   2676 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2677 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   2678 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   2679 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2680 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2681 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   2682 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   2683 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   2684 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
   2685 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   2686 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   2687 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2688 ; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
   2689 ; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
   2690 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2691 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2692 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   2693 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   2694 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   2695 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   2696 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2697 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   2698 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   2699 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   2700 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
   2701 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
   2702 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   2703 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   2704 ; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
   2705 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
   2706 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   2707 ; AVX1-NEXT:    retq
   2708 ;
   2709 ; AVX2-LABEL: test_bitreverse_v32i16:
   2710 ; AVX2:       # BB#0:
   2711 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   2712 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   2713 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2714 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
   2715 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2716 ; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
   2717 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   2718 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
   2719 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2720 ; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
   2721 ; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
   2722 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   2723 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
   2724 ; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
   2725 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
   2726 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
   2727 ; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
   2728 ; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
   2729 ; AVX2-NEXT:    retq
   2730 ;
   2731 ; AVX512F-LABEL: test_bitreverse_v32i16:
   2732 ; AVX512F:       # BB#0:
   2733 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   2734 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   2735 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2736 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm4
   2737 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2738 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
   2739 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
   2740 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
   2741 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2742 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
   2743 ; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
   2744 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   2745 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm2
   2746 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
   2747 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
   2748 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
   2749 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
   2750 ; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
   2751 ; AVX512F-NEXT:    retq
   2752 ;
   2753 ; AVX512BW-LABEL: test_bitreverse_v32i16:
   2754 ; AVX512BW:       # BB#0:
   2755 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
   2756 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2757 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
   2758 ; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2759 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
   2760 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
   2761 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   2762 ; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2763 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
   2764 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
   2765 ; AVX512BW-NEXT:    retq
   2766 ;
   2767 ; XOPAVX1-LABEL: test_bitreverse_v32i16:
   2768 ; XOPAVX1:       # BB#0:
   2769 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2770 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
   2771 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2772 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   2773 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2774 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2775 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2776 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   2777 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   2778 ; XOPAVX1-NEXT:    retq
   2779 ;
   2780 ; XOPAVX2-LABEL: test_bitreverse_v32i16:
   2781 ; XOPAVX2:       # BB#0:
   2782 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2783 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
   2784 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2785 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   2786 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   2787 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2788 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2789 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   2790 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
   2791 ; XOPAVX2-NEXT:    retq
   2792   %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
   2793   ret <32 x i16> %b
   2794 }
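; For v32i16 the only addition over the v64i8 case is an initial byte swap inside every
; 16-bit element (the 1,0,3,2,... byte shuffles and the 81,80,83,82,... VPPERM
; selectors), after which the same per-byte bit reverse is applied. Roughly, in scalar
; terms (illustrative sketch, reusing the hypothetical bitrev8 above):
;
;   unsigned short bitrev16(unsigned short w) {
;     /* swap the two bytes, then reverse the bits inside each byte */
;     return (unsigned short)((bitrev8(w & 0xFF) << 8) | bitrev8(w >> 8));
;   }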
   2795 
   2796 define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
   2797 ; SSE2-LABEL: test_bitreverse_v16i32:
   2798 ; SSE2:       # BB#0:
   2799 ; SSE2-NEXT:    pxor %xmm9, %xmm9
   2800 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   2801 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2802 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   2803 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   2804 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
   2805 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   2806 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
   2807 ; SSE2-NEXT:    packuswb %xmm4, %xmm0
   2808 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2809 ; SSE2-NEXT:    psllw $5, %xmm5
   2810 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
   2811 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
   2812 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2813 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   2814 ; SSE2-NEXT:    psllw $7, %xmm4
   2815 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   2816 ; SSE2-NEXT:    pand %xmm11, %xmm11
   2817 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2818 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   2819 ; SSE2-NEXT:    psllw $3, %xmm6
   2820 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
   2821 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
   2822 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2823 ; SSE2-NEXT:    por %xmm5, %xmm6
   2824 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2825 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2826 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
   2827 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2828 ; SSE2-NEXT:    por %xmm6, %xmm5
   2829 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   2830 ; SSE2-NEXT:    psrlw $1, %xmm6
   2831 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   2832 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
   2833 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2834 ; SSE2-NEXT:    por %xmm5, %xmm6
   2835 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   2836 ; SSE2-NEXT:    psrlw $3, %xmm7
   2837 ; SSE2-NEXT:    movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
   2838 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm14
   2839 ; SSE2-NEXT:    pand %xmm14, %xmm7
   2840 ; SSE2-NEXT:    por %xmm6, %xmm7
   2841 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2842 ; SSE2-NEXT:    psrlw $5, %xmm5
   2843 ; SSE2-NEXT:    movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
   2844 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm15
   2845 ; SSE2-NEXT:    pand %xmm15, %xmm5
   2846 ; SSE2-NEXT:    por %xmm7, %xmm5
   2847 ; SSE2-NEXT:    psrlw $7, %xmm0
   2848 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   2849 ; SSE2-NEXT:    pand %xmm7, %xmm7
   2850 ; SSE2-NEXT:    pand %xmm7, %xmm0
   2851 ; SSE2-NEXT:    por %xmm5, %xmm0
   2852 ; SSE2-NEXT:    por %xmm4, %xmm0
   2853 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2854 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2855 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   2856 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   2857 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
   2858 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
   2859 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
   2860 ; SSE2-NEXT:    packuswb %xmm4, %xmm1
   2861 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2862 ; SSE2-NEXT:    psllw $5, %xmm5
   2863 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2864 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2865 ; SSE2-NEXT:    psllw $7, %xmm4
   2866 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2867 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2868 ; SSE2-NEXT:    psllw $3, %xmm6
   2869 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2870 ; SSE2-NEXT:    por %xmm5, %xmm6
   2871 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2872 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2873 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2874 ; SSE2-NEXT:    por %xmm6, %xmm5
   2875 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2876 ; SSE2-NEXT:    psrlw $1, %xmm6
   2877 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2878 ; SSE2-NEXT:    por %xmm5, %xmm6
   2879 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2880 ; SSE2-NEXT:    psrlw $3, %xmm5
   2881 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2882 ; SSE2-NEXT:    por %xmm6, %xmm5
   2883 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2884 ; SSE2-NEXT:    psrlw $5, %xmm6
   2885 ; SSE2-NEXT:    pand %xmm15, %xmm6
   2886 ; SSE2-NEXT:    por %xmm5, %xmm6
   2887 ; SSE2-NEXT:    psrlw $7, %xmm1
   2888 ; SSE2-NEXT:    pand %xmm7, %xmm1
   2889 ; SSE2-NEXT:    por %xmm6, %xmm1
   2890 ; SSE2-NEXT:    por %xmm4, %xmm1
   2891 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2892 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2893 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   2894 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   2895 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
   2896 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
   2897 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
   2898 ; SSE2-NEXT:    packuswb %xmm4, %xmm2
   2899 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2900 ; SSE2-NEXT:    psllw $5, %xmm5
   2901 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2902 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2903 ; SSE2-NEXT:    psllw $7, %xmm4
   2904 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2905 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2906 ; SSE2-NEXT:    psllw $3, %xmm6
   2907 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2908 ; SSE2-NEXT:    por %xmm5, %xmm6
   2909 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2910 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2911 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2912 ; SSE2-NEXT:    por %xmm6, %xmm5
   2913 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2914 ; SSE2-NEXT:    psrlw $1, %xmm6
   2915 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2916 ; SSE2-NEXT:    por %xmm5, %xmm6
   2917 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2918 ; SSE2-NEXT:    psrlw $3, %xmm5
   2919 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2920 ; SSE2-NEXT:    por %xmm6, %xmm5
   2921 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2922 ; SSE2-NEXT:    psrlw $5, %xmm6
   2923 ; SSE2-NEXT:    pand %xmm15, %xmm6
   2924 ; SSE2-NEXT:    por %xmm5, %xmm6
   2925 ; SSE2-NEXT:    psrlw $7, %xmm2
   2926 ; SSE2-NEXT:    pand %xmm7, %xmm2
   2927 ; SSE2-NEXT:    por %xmm6, %xmm2
   2928 ; SSE2-NEXT:    por %xmm4, %xmm2
   2929 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   2930 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   2931 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   2932 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   2933 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
   2934 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
   2935 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
   2936 ; SSE2-NEXT:    packuswb %xmm4, %xmm3
   2937 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2938 ; SSE2-NEXT:    psllw $5, %xmm5
   2939 ; SSE2-NEXT:    pand %xmm10, %xmm5
   2940 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   2941 ; SSE2-NEXT:    psllw $7, %xmm4
   2942 ; SSE2-NEXT:    pand %xmm11, %xmm4
   2943 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   2944 ; SSE2-NEXT:    psllw $3, %xmm6
   2945 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2946 ; SSE2-NEXT:    por %xmm5, %xmm6
   2947 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2948 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   2949 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2950 ; SSE2-NEXT:    por %xmm6, %xmm5
   2951 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   2952 ; SSE2-NEXT:    psrlw $1, %xmm6
   2953 ; SSE2-NEXT:    pand %xmm13, %xmm6
   2954 ; SSE2-NEXT:    por %xmm5, %xmm6
   2955 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2956 ; SSE2-NEXT:    psrlw $3, %xmm5
   2957 ; SSE2-NEXT:    pand %xmm14, %xmm5
   2958 ; SSE2-NEXT:    por %xmm6, %xmm5
   2959 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   2960 ; SSE2-NEXT:    psrlw $5, %xmm6
   2961 ; SSE2-NEXT:    pand %xmm15, %xmm6
   2962 ; SSE2-NEXT:    por %xmm5, %xmm6
   2963 ; SSE2-NEXT:    psrlw $7, %xmm3
   2964 ; SSE2-NEXT:    pand %xmm7, %xmm3
   2965 ; SSE2-NEXT:    por %xmm6, %xmm3
   2966 ; SSE2-NEXT:    por %xmm4, %xmm3
   2967 ; SSE2-NEXT:    retq
   2968 ;
   2969 ; SSSE3-LABEL: test_bitreverse_v16i32:
   2970 ; SSSE3:       # BB#0:
   2971 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
   2972 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
   2973 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   2974 ; SSSE3-NEXT:    pshufb %xmm8, %xmm1
   2975 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2976 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   2977 ; SSSE3-NEXT:    pand %xmm9, %xmm0
   2978 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2979 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   2980 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   2981 ; SSSE3-NEXT:    psrlw $4, %xmm1
   2982 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   2983 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2984 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
   2985 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
   2986 ; SSSE3-NEXT:    por %xmm6, %xmm0
   2987 ; SSSE3-NEXT:    pshufb %xmm8, %xmm5
   2988 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
   2989 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   2990 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   2991 ; SSSE3-NEXT:    pshufb %xmm1, %xmm6
   2992 ; SSSE3-NEXT:    psrlw $4, %xmm5
   2993 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   2994 ; SSSE3-NEXT:    movdqa %xmm4, %xmm1
   2995 ; SSSE3-NEXT:    pshufb %xmm5, %xmm1
   2996 ; SSSE3-NEXT:    por %xmm6, %xmm1
   2997 ; SSSE3-NEXT:    pshufb %xmm8, %xmm2
   2998 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
   2999 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   3000 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   3001 ; SSSE3-NEXT:    pshufb %xmm5, %xmm6
   3002 ; SSSE3-NEXT:    psrlw $4, %xmm2
   3003 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   3004 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
   3005 ; SSSE3-NEXT:    pshufb %xmm2, %xmm5
   3006 ; SSSE3-NEXT:    por %xmm6, %xmm5
   3007 ; SSSE3-NEXT:    pshufb %xmm8, %xmm3
   3008 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
   3009 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   3010 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   3011 ; SSSE3-NEXT:    psrlw $4, %xmm3
   3012 ; SSSE3-NEXT:    pand %xmm9, %xmm3
   3013 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
   3014 ; SSSE3-NEXT:    por %xmm7, %xmm4
   3015 ; SSSE3-NEXT:    movdqa %xmm5, %xmm2
   3016 ; SSSE3-NEXT:    movdqa %xmm4, %xmm3
   3017 ; SSSE3-NEXT:    retq
   3018 ;
   3019 ; AVX1-LABEL: test_bitreverse_v16i32:
   3020 ; AVX1:       # BB#0:
   3021 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   3022 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   3023 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   3024 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   3025 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   3026 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   3027 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   3028 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   3029 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   3030 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   3031 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   3032 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   3033 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   3034 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
   3035 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   3036 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   3037 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   3038 ; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
   3039 ; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
   3040 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   3041 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   3042 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   3043 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   3044 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   3045 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   3046 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   3047 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   3048 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   3049 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   3050 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
   3051 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
   3052 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   3053 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   3054 ; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
   3055 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
   3056 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   3057 ; AVX1-NEXT:    retq
   3058 ;
   3059 ; AVX2-LABEL: test_bitreverse_v16i32:
   3060 ; AVX2:       # BB#0:
   3061 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   3062 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   3063 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   3064 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
   3065 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   3066 ; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
   3067 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   3068 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
   3069 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   3070 ; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
   3071 ; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
   3072 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   3073 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
   3074 ; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
   3075 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
   3076 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
   3077 ; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
   3078 ; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
   3079 ; AVX2-NEXT:    retq
   3080 ;
   3081 ; AVX512F-LABEL: test_bitreverse_v16i32:
   3082 ; AVX512F:       # BB#0:
   3083 ; AVX512F-NEXT:    vpslld $29, %zmm0, %zmm1
   3084 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
   3085 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm2
   3086 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3087 ; AVX512F-NEXT:    vpord %zmm1, %zmm2, %zmm1
   3088 ; AVX512F-NEXT:    vpslld $27, %zmm0, %zmm2
   3089 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3090 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3091 ; AVX512F-NEXT:    vpslld $25, %zmm0, %zmm2
   3092 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3093 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3094 ; AVX512F-NEXT:    vpslld $23, %zmm0, %zmm2
   3095 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3096 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3097 ; AVX512F-NEXT:    vpslld $21, %zmm0, %zmm2
   3098 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3099 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3100 ; AVX512F-NEXT:    vpslld $19, %zmm0, %zmm2
   3101 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3102 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3103 ; AVX512F-NEXT:    vpslld $17, %zmm0, %zmm2
   3104 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3105 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3106 ; AVX512F-NEXT:    vpslld $15, %zmm0, %zmm2
   3107 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3108 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3109 ; AVX512F-NEXT:    vpslld $13, %zmm0, %zmm2
   3110 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3111 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3112 ; AVX512F-NEXT:    vpslld $11, %zmm0, %zmm2
   3113 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3114 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3115 ; AVX512F-NEXT:    vpslld $9, %zmm0, %zmm2
   3116 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3117 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3118 ; AVX512F-NEXT:    vpslld $7, %zmm0, %zmm2
   3119 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3120 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3121 ; AVX512F-NEXT:    vpslld $5, %zmm0, %zmm2
   3122 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3123 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3124 ; AVX512F-NEXT:    vpslld $3, %zmm0, %zmm2
   3125 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3126 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3127 ; AVX512F-NEXT:    vpslld $1, %zmm0, %zmm2
   3128 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3129 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3130 ; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm2
   3131 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3132 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3133 ; AVX512F-NEXT:    vpsrld $3, %zmm0, %zmm2
   3134 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3135 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3136 ; AVX512F-NEXT:    vpsrld $5, %zmm0, %zmm2
   3137 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3138 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3139 ; AVX512F-NEXT:    vpsrld $7, %zmm0, %zmm2
   3140 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3141 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3142 ; AVX512F-NEXT:    vpsrld $9, %zmm0, %zmm2
   3143 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3144 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3145 ; AVX512F-NEXT:    vpsrld $11, %zmm0, %zmm2
   3146 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3147 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3148 ; AVX512F-NEXT:    vpsrld $13, %zmm0, %zmm2
   3149 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3150 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3151 ; AVX512F-NEXT:    vpsrld $15, %zmm0, %zmm2
   3152 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3153 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3154 ; AVX512F-NEXT:    vpsrld $17, %zmm0, %zmm2
   3155 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3156 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3157 ; AVX512F-NEXT:    vpsrld $19, %zmm0, %zmm2
   3158 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3159 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3160 ; AVX512F-NEXT:    vpsrld $21, %zmm0, %zmm2
   3161 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3162 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3163 ; AVX512F-NEXT:    vpsrld $23, %zmm0, %zmm2
   3164 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3165 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3166 ; AVX512F-NEXT:    vpsrld $25, %zmm0, %zmm2
   3167 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3168 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3169 ; AVX512F-NEXT:    vpsrld $27, %zmm0, %zmm2
   3170 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3171 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3172 ; AVX512F-NEXT:    vpsrld $29, %zmm0, %zmm2
   3173 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   3174 ; AVX512F-NEXT:    vpord %zmm2, %zmm1, %zmm1
   3175 ; AVX512F-NEXT:    vpsrld $31, %zmm0, %zmm0
   3176 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
   3177 ; AVX512F-NEXT:    vpord %zmm0, %zmm1, %zmm0
   3178 ; AVX512F-NEXT:    retq
   3179 ;
   3180 ; AVX512BW-LABEL: test_bitreverse_v16i32:
   3181 ; AVX512BW:       # BB#0:
   3182 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
   3183 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   3184 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
   3185 ; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   3186 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
   3187 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
   3188 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   3189 ; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   3190 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
   3191 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
   3192 ; AVX512BW-NEXT:    retq
   3193 ;
   3194 ; XOPAVX1-LABEL: test_bitreverse_v16i32:
   3195 ; XOPAVX1:       # BB#0:
   3196 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   3197 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
   3198 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   3199 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   3200 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   3201 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   3202 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   3203 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   3204 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   3205 ; XOPAVX1-NEXT:    retq
   3206 ;
   3207 ; XOPAVX2-LABEL: test_bitreverse_v16i32:
   3208 ; XOPAVX2:       # BB#0:
   3209 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   3210 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
   3211 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   3212 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   3213 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   3214 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   3215 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   3216 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   3217 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
   3218 ; XOPAVX2-NEXT:    retq
   3219   %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
   3220   ret <16 x i32> %b
   3221 }
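; v16i32 follows the same pattern with a 3,2,1,0 byte reverse per 32-bit element before
; the per-byte bit reverse. The AVX512F-only lowering is the outlier; 512-bit byte
; shuffles require AVX512BW, so it instead moves every bit of each dword individually,
; one broadcast-masked shift plus OR per bit. An illustrative scalar sketch of that
; bit-by-bit approach (hypothetical C, not part of the generated code):
;
;   unsigned bitrev32(unsigned x) {
;     unsigned r = 0;
;     for (int i = 0; i < 32; i++)      /* one vpslld/vpsrld + vpandd + vpord per bit */
;       r |= ((x >> i) & 1u) << (31 - i);
;     return r;
;   }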
   3222 
   3223 define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
   3224 ; SSE2-LABEL: test_bitreverse_v8i64:
   3225 ; SSE2:       # BB#0:
   3226 ; SSE2-NEXT:    pxor %xmm9, %xmm9
   3227 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   3228 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   3229 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
   3230 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   3231 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   3232 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
   3233 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   3234 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   3235 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
   3236 ; SSE2-NEXT:    packuswb %xmm4, %xmm0
   3237 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   3238 ; SSE2-NEXT:    psllw $5, %xmm5
   3239 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
   3240 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm10
   3241 ; SSE2-NEXT:    pand %xmm10, %xmm5
   3242 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   3243 ; SSE2-NEXT:    psllw $7, %xmm4
   3244 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
   3245 ; SSE2-NEXT:    pand %xmm11, %xmm11
   3246 ; SSE2-NEXT:    pand %xmm11, %xmm4
   3247 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   3248 ; SSE2-NEXT:    psllw $3, %xmm6
   3249 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
   3250 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm12
   3251 ; SSE2-NEXT:    pand %xmm12, %xmm6
   3252 ; SSE2-NEXT:    por %xmm5, %xmm6
   3253 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   3254 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   3255 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
   3256 ; SSE2-NEXT:    pand %xmm8, %xmm5
   3257 ; SSE2-NEXT:    por %xmm6, %xmm5
   3258 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   3259 ; SSE2-NEXT:    psrlw $1, %xmm6
   3260 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
   3261 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm13
   3262 ; SSE2-NEXT:    pand %xmm13, %xmm6
   3263 ; SSE2-NEXT:    por %xmm5, %xmm6
   3264 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   3265 ; SSE2-NEXT:    psrlw $3, %xmm7
   3266 ; SSE2-NEXT:    movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
   3267 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm14
   3268 ; SSE2-NEXT:    pand %xmm14, %xmm7
   3269 ; SSE2-NEXT:    por %xmm6, %xmm7
   3270 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   3271 ; SSE2-NEXT:    psrlw $5, %xmm5
   3272 ; SSE2-NEXT:    movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
   3273 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm15
   3274 ; SSE2-NEXT:    pand %xmm15, %xmm5
   3275 ; SSE2-NEXT:    por %xmm7, %xmm5
   3276 ; SSE2-NEXT:    psrlw $7, %xmm0
   3277 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
   3278 ; SSE2-NEXT:    pand %xmm7, %xmm7
   3279 ; SSE2-NEXT:    pand %xmm7, %xmm0
   3280 ; SSE2-NEXT:    por %xmm5, %xmm0
   3281 ; SSE2-NEXT:    por %xmm4, %xmm0
   3282 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   3283 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   3284 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
   3285 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   3286 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   3287 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
   3288 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   3289 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
   3290 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
   3291 ; SSE2-NEXT:    packuswb %xmm4, %xmm1
   3292 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   3293 ; SSE2-NEXT:    psllw $5, %xmm5
   3294 ; SSE2-NEXT:    pand %xmm10, %xmm5
   3295 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   3296 ; SSE2-NEXT:    psllw $7, %xmm4
   3297 ; SSE2-NEXT:    pand %xmm11, %xmm4
   3298 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   3299 ; SSE2-NEXT:    psllw $3, %xmm6
   3300 ; SSE2-NEXT:    pand %xmm12, %xmm6
   3301 ; SSE2-NEXT:    por %xmm5, %xmm6
   3302 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   3303 ; SSE2-NEXT:    paddb %xmm5, %xmm5
   3304 ; SSE2-NEXT:    pand %xmm8, %xmm5
   3305 ; SSE2-NEXT:    por %xmm6, %xmm5
   3306 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   3307 ; SSE2-NEXT:    psrlw $1, %xmm6
   3308 ; SSE2-NEXT:    pand %xmm13, %xmm6
   3309 ; SSE2-NEXT:    por %xmm5, %xmm6
   3310 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   3311 ; SSE2-NEXT:    psrlw $3, %xmm5
   3312 ; SSE2-NEXT:    pand %xmm14, %xmm5
   3313 ; SSE2-NEXT:    por %xmm6, %xmm5
   3314 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   3315 ; SSE2-NEXT:    psrlw $5, %xmm6
   3316 ; SSE2-NEXT:    pand %xmm15, %xmm6
   3317 ; SSE2-NEXT:    por %xmm5, %xmm6
   3318 ; SSE2-NEXT:    psrlw $7, %xmm1
   3319 ; SSE2-NEXT:    pand %xmm7, %xmm1
   3320 ; SSE2-NEXT:    por %xmm6, %xmm1
   3321 ; SSE2-NEXT:    por %xmm4, %xmm1
   3322 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   3323 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
   3324 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
   3325 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   3326 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   3327 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
   3328 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psllw $5, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm5
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psllw $7, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    psllw $3, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    paddb %xmm5, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    por %xmm6, %xmm5
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psrlw $3, %xmm5
; SSE2-NEXT:    pand %xmm14, %xmm5
; SSE2-NEXT:    por %xmm6, %xmm5
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    psrlw $5, %xmm6
; SSE2-NEXT:    pand %xmm15, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    psrlw $7, %xmm2
; SSE2-NEXT:    pand %xmm7, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm4, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    psllw $5, %xmm5
; SSE2-NEXT:    pand %xmm10, %xmm5
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllw $7, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    psllw $3, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    paddb %xmm5, %xmm5
; SSE2-NEXT:    pand %xmm8, %xmm5
; SSE2-NEXT:    por %xmm6, %xmm5
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    pand %xmm13, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    psrlw $3, %xmm5
; SSE2-NEXT:    pand %xmm14, %xmm5
; SSE2-NEXT:    por %xmm6, %xmm5
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    psrlw $5, %xmm6
; SSE2-NEXT:    pand %xmm15, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    psrlw $7, %xmm3
; SSE2-NEXT:    pand %xmm7, %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm9, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm6, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm9, %xmm1
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm1, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm6, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm9, %xmm5
; SSSE3-NEXT:    movdqa %xmm7, %xmm6
; SSSE3-NEXT:    pshufb %xmm5, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm6, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm9, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm9, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm7, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpsllq $61, %zmm0, %zmm1
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsllq $59, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $57, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $55, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $53, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $51, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $49, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $47, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $45, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $43, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $41, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $39, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $37, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $35, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $33, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $31, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $29, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $27, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $25, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $23, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $21, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $19, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $17, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $15, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $13, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $11, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $9, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $7, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $5, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $3, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsllq $1, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $1, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $3, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $5, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $7, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $9, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $11, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $13, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $15, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $17, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $19, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $21, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $23, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $25, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $27, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $29, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $31, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $33, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $35, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $37, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $39, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $41, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $43, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $45, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $47, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $49, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $51, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $53, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $55, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $57, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $59, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $61, %zmm0, %zmm2
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
; AVX512F-NEXT:    vpsrlq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW:       # BB#0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone