      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
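; This file covers the x86-64 lowering of the llvm.bitreverse.* intrinsics for scalar and
; vector types across the SSE2, SSSE3, AVX, AVX2, AVX-512 and XOP feature levels listed in
; the RUN lines below.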
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512F
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX  --check-prefix=AVX512  --check-prefix=AVX512BW
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
     10 
     11 ; Make sure we don't crash with avx512bw and xop
     12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
     13 
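; Scalar cases: without XOP, the reversal is expanded into a byte/nibble swap (rolb, rolw,
; bswap) followed by mask-and-shift steps that exchange the remaining 4-, 2- and 1-bit
; fields; with XOP, the value is moved into an XMM register, reversed with a single VPPERM,
; and extracted back to a GPR.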
     14 define i8 @test_bitreverse_i8(i8 %a) nounwind {
     15 ; SSE-LABEL: test_bitreverse_i8:
     16 ; SSE:       # %bb.0:
     17 ; SSE-NEXT:    rolb $4, %dil
     18 ; SSE-NEXT:    movl %edi, %eax
     19 ; SSE-NEXT:    andb $51, %al
     20 ; SSE-NEXT:    shlb $2, %al
     21 ; SSE-NEXT:    andb $-52, %dil
     22 ; SSE-NEXT:    shrb $2, %dil
     23 ; SSE-NEXT:    orb %al, %dil
     24 ; SSE-NEXT:    movl %edi, %eax
     25 ; SSE-NEXT:    andb $85, %al
     26 ; SSE-NEXT:    addb %al, %al
     27 ; SSE-NEXT:    andb $-86, %dil
     28 ; SSE-NEXT:    shrb %dil
     29 ; SSE-NEXT:    orb %al, %dil
     30 ; SSE-NEXT:    movl %edi, %eax
     31 ; SSE-NEXT:    retq
     32 ;
     33 ; AVX-LABEL: test_bitreverse_i8:
     34 ; AVX:       # %bb.0:
     35 ; AVX-NEXT:    rolb $4, %dil
     36 ; AVX-NEXT:    movl %edi, %eax
     37 ; AVX-NEXT:    andb $51, %al
     38 ; AVX-NEXT:    shlb $2, %al
     39 ; AVX-NEXT:    andb $-52, %dil
     40 ; AVX-NEXT:    shrb $2, %dil
     41 ; AVX-NEXT:    orb %al, %dil
     42 ; AVX-NEXT:    movl %edi, %eax
     43 ; AVX-NEXT:    andb $85, %al
     44 ; AVX-NEXT:    addb %al, %al
     45 ; AVX-NEXT:    andb $-86, %dil
     46 ; AVX-NEXT:    shrb %dil
     47 ; AVX-NEXT:    orb %al, %dil
     48 ; AVX-NEXT:    movl %edi, %eax
     49 ; AVX-NEXT:    retq
     50 ;
     51 ; XOP-LABEL: test_bitreverse_i8:
     52 ; XOP:       # %bb.0:
     53 ; XOP-NEXT:    vmovd %edi, %xmm0
     54 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
     55 ; XOP-NEXT:    vpextrb $0, %xmm0, %eax
     56 ; XOP-NEXT:    # kill: def $al killed $al killed $eax
     57 ; XOP-NEXT:    retq
     58   %b = call i8 @llvm.bitreverse.i8(i8 %a)
     59   ret i8 %b
     60 }
     61 
     62 define i16 @test_bitreverse_i16(i16 %a) nounwind {
     63 ; SSE-LABEL: test_bitreverse_i16:
     64 ; SSE:       # %bb.0:
     65 ; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
     66 ; SSE-NEXT:    rolw $8, %di
     67 ; SSE-NEXT:    movl %edi, %eax
     68 ; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
     69 ; SSE-NEXT:    shll $4, %eax
     70 ; SSE-NEXT:    andl $61680, %edi # imm = 0xF0F0
     71 ; SSE-NEXT:    shrl $4, %edi
     72 ; SSE-NEXT:    orl %eax, %edi
     73 ; SSE-NEXT:    movl %edi, %eax
     74 ; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
     75 ; SSE-NEXT:    andl $52428, %edi # imm = 0xCCCC
     76 ; SSE-NEXT:    shrl $2, %edi
     77 ; SSE-NEXT:    leal (%rdi,%rax,4), %eax
     78 ; SSE-NEXT:    movl %eax, %ecx
     79 ; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
     80 ; SSE-NEXT:    andl $43690, %eax # imm = 0xAAAA
     81 ; SSE-NEXT:    shrl %eax
     82 ; SSE-NEXT:    leal (%rax,%rcx,2), %eax
     83 ; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
     84 ; SSE-NEXT:    retq
     85 ;
     86 ; AVX-LABEL: test_bitreverse_i16:
     87 ; AVX:       # %bb.0:
     88 ; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
     89 ; AVX-NEXT:    rolw $8, %di
     90 ; AVX-NEXT:    movl %edi, %eax
     91 ; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
     92 ; AVX-NEXT:    shll $4, %eax
     93 ; AVX-NEXT:    andl $61680, %edi # imm = 0xF0F0
     94 ; AVX-NEXT:    shrl $4, %edi
     95 ; AVX-NEXT:    orl %eax, %edi
     96 ; AVX-NEXT:    movl %edi, %eax
     97 ; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
     98 ; AVX-NEXT:    andl $52428, %edi # imm = 0xCCCC
     99 ; AVX-NEXT:    shrl $2, %edi
    100 ; AVX-NEXT:    leal (%rdi,%rax,4), %eax
    101 ; AVX-NEXT:    movl %eax, %ecx
    102 ; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
    103 ; AVX-NEXT:    andl $43690, %eax # imm = 0xAAAA
    104 ; AVX-NEXT:    shrl %eax
    105 ; AVX-NEXT:    leal (%rax,%rcx,2), %eax
    106 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
    107 ; AVX-NEXT:    retq
    108 ;
    109 ; XOP-LABEL: test_bitreverse_i16:
    110 ; XOP:       # %bb.0:
    111 ; XOP-NEXT:    vmovd %edi, %xmm0
    112 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    113 ; XOP-NEXT:    vmovd %xmm0, %eax
    114 ; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
    115 ; XOP-NEXT:    retq
    116   %b = call i16 @llvm.bitreverse.i16(i16 %a)
    117   ret i16 %b
    118 }
    119 
    120 define i32 @test_bitreverse_i32(i32 %a) nounwind {
    121 ; SSE-LABEL: test_bitreverse_i32:
    122 ; SSE:       # %bb.0:
    123 ; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
    124 ; SSE-NEXT:    bswapl %edi
    125 ; SSE-NEXT:    movl %edi, %eax
    126 ; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
    127 ; SSE-NEXT:    shll $4, %eax
    128 ; SSE-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
    129 ; SSE-NEXT:    shrl $4, %edi
    130 ; SSE-NEXT:    orl %eax, %edi
    131 ; SSE-NEXT:    movl %edi, %eax
    132 ; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
    133 ; SSE-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
    134 ; SSE-NEXT:    shrl $2, %edi
    135 ; SSE-NEXT:    leal (%rdi,%rax,4), %eax
    136 ; SSE-NEXT:    movl %eax, %ecx
    137 ; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
    138 ; SSE-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
    139 ; SSE-NEXT:    shrl %eax
    140 ; SSE-NEXT:    leal (%rax,%rcx,2), %eax
    141 ; SSE-NEXT:    retq
    142 ;
    143 ; AVX-LABEL: test_bitreverse_i32:
    144 ; AVX:       # %bb.0:
    145 ; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
    146 ; AVX-NEXT:    bswapl %edi
    147 ; AVX-NEXT:    movl %edi, %eax
    148 ; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
    149 ; AVX-NEXT:    shll $4, %eax
    150 ; AVX-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
    151 ; AVX-NEXT:    shrl $4, %edi
    152 ; AVX-NEXT:    orl %eax, %edi
    153 ; AVX-NEXT:    movl %edi, %eax
    154 ; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
    155 ; AVX-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
    156 ; AVX-NEXT:    shrl $2, %edi
    157 ; AVX-NEXT:    leal (%rdi,%rax,4), %eax
    158 ; AVX-NEXT:    movl %eax, %ecx
    159 ; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
    160 ; AVX-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
    161 ; AVX-NEXT:    shrl %eax
    162 ; AVX-NEXT:    leal (%rax,%rcx,2), %eax
    163 ; AVX-NEXT:    retq
    164 ;
    165 ; XOP-LABEL: test_bitreverse_i32:
    166 ; XOP:       # %bb.0:
    167 ; XOP-NEXT:    vmovd %edi, %xmm0
    168 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    169 ; XOP-NEXT:    vmovd %xmm0, %eax
    170 ; XOP-NEXT:    retq
    171   %b = call i32 @llvm.bitreverse.i32(i32 %a)
    172   ret i32 %b
    173 }
    174 
    175 define i64 @test_bitreverse_i64(i64 %a) nounwind {
    176 ; SSE-LABEL: test_bitreverse_i64:
    177 ; SSE:       # %bb.0:
    178 ; SSE-NEXT:    bswapq %rdi
    179 ; SSE-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
    180 ; SSE-NEXT:    andq %rdi, %rax
    181 ; SSE-NEXT:    shlq $4, %rax
    182 ; SSE-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
    183 ; SSE-NEXT:    andq %rdi, %rcx
    184 ; SSE-NEXT:    shrq $4, %rcx
    185 ; SSE-NEXT:    orq %rax, %rcx
    186 ; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
    187 ; SSE-NEXT:    andq %rcx, %rax
    188 ; SSE-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
    189 ; SSE-NEXT:    andq %rcx, %rdx
    190 ; SSE-NEXT:    shrq $2, %rdx
    191 ; SSE-NEXT:    leaq (%rdx,%rax,4), %rax
    192 ; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
    193 ; SSE-NEXT:    andq %rax, %rcx
    194 ; SSE-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
    195 ; SSE-NEXT:    andq %rax, %rdx
    196 ; SSE-NEXT:    shrq %rdx
    197 ; SSE-NEXT:    leaq (%rdx,%rcx,2), %rax
    198 ; SSE-NEXT:    retq
    199 ;
    200 ; AVX-LABEL: test_bitreverse_i64:
    201 ; AVX:       # %bb.0:
    202 ; AVX-NEXT:    bswapq %rdi
    203 ; AVX-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
    204 ; AVX-NEXT:    andq %rdi, %rax
    205 ; AVX-NEXT:    shlq $4, %rax
    206 ; AVX-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
    207 ; AVX-NEXT:    andq %rdi, %rcx
    208 ; AVX-NEXT:    shrq $4, %rcx
    209 ; AVX-NEXT:    orq %rax, %rcx
    210 ; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
    211 ; AVX-NEXT:    andq %rcx, %rax
    212 ; AVX-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
    213 ; AVX-NEXT:    andq %rcx, %rdx
    214 ; AVX-NEXT:    shrq $2, %rdx
    215 ; AVX-NEXT:    leaq (%rdx,%rax,4), %rax
    216 ; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
    217 ; AVX-NEXT:    andq %rax, %rcx
    218 ; AVX-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
    219 ; AVX-NEXT:    andq %rax, %rdx
    220 ; AVX-NEXT:    shrq %rdx
    221 ; AVX-NEXT:    leaq (%rdx,%rcx,2), %rax
    222 ; AVX-NEXT:    retq
    223 ;
    224 ; XOP-LABEL: test_bitreverse_i64:
    225 ; XOP:       # %bb.0:
    226 ; XOP-NEXT:    vmovq %rdi, %xmm0
    227 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    228 ; XOP-NEXT:    vmovq %xmm0, %rax
    229 ; XOP-NEXT:    retq
    230   %b = call i64 @llvm.bitreverse.i64(i64 %a)
    231   ret i64 %b
    232 }
    233 
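; 128-bit vectors: the SSE2 expansion first swaps the bytes within each element (punpck +
; pshuflw/pshufhw + packuswb, not needed for v16i8) and then reverses the bits of every byte
; with pand/psllw/psrlw mask-and-shift steps; SSSE3 and AVX replace the bit swaps with two
; 16-entry PSHUFB nibble lookup tables, and XOP folds the whole reversal into one VPPERM.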
    234 define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
    235 ; SSE2-LABEL: test_bitreverse_v16i8:
    236 ; SSE2:       # %bb.0:
    237 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    238 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    239 ; SSE2-NEXT:    pand %xmm1, %xmm2
    240 ; SSE2-NEXT:    psllw $4, %xmm2
    241 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
    242 ; SSE2-NEXT:    pand %xmm3, %xmm2
    243 ; SSE2-NEXT:    pand %xmm3, %xmm0
    244 ; SSE2-NEXT:    psrlw $4, %xmm0
    245 ; SSE2-NEXT:    pand %xmm1, %xmm0
    246 ; SSE2-NEXT:    por %xmm2, %xmm0
    247 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
    248 ; SSE2-NEXT:    pand %xmm0, %xmm1
    249 ; SSE2-NEXT:    psllw $2, %xmm1
    250 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
    251 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    252 ; SSE2-NEXT:    psrlw $2, %xmm0
    253 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    254 ; SSE2-NEXT:    por %xmm1, %xmm0
    255 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
    256 ; SSE2-NEXT:    pand %xmm0, %xmm1
    257 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    258 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    259 ; SSE2-NEXT:    psrlw $1, %xmm0
    260 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    261 ; SSE2-NEXT:    por %xmm1, %xmm0
    262 ; SSE2-NEXT:    retq
    263 ;
    264 ; SSSE3-LABEL: test_bitreverse_v16i8:
    265 ; SSSE3:       # %bb.0:
    266 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    267 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
    268 ; SSSE3-NEXT:    pand %xmm1, %xmm2
    269 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    270 ; SSSE3-NEXT:    pshufb %xmm2, %xmm3
    271 ; SSSE3-NEXT:    psrlw $4, %xmm0
    272 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    273 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    274 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
    275 ; SSSE3-NEXT:    por %xmm3, %xmm1
    276 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
    277 ; SSSE3-NEXT:    retq
    278 ;
    279 ; AVX-LABEL: test_bitreverse_v16i8:
    280 ; AVX:       # %bb.0:
    281 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    282 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
    283 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    284 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
    285 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
    286 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    287 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    288 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
    289 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
    290 ; AVX-NEXT:    retq
    291 ;
    292 ; XOP-LABEL: test_bitreverse_v16i8:
    293 ; XOP:       # %bb.0:
    294 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    295 ; XOP-NEXT:    retq
    296   %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
    297   ret <16 x i8> %b
    298 }
    299 
    300 define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
    301 ; SSE2-LABEL: test_bitreverse_v8i16:
    302 ; SSE2:       # %bb.0:
    303 ; SSE2-NEXT:    pxor %xmm1, %xmm1
    304 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    305 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
    306 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
    307 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
    308 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    309 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
    310 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
    311 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
    312 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    313 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    314 ; SSE2-NEXT:    pand %xmm1, %xmm2
    315 ; SSE2-NEXT:    psllw $4, %xmm2
    316 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
    317 ; SSE2-NEXT:    pand %xmm3, %xmm2
    318 ; SSE2-NEXT:    pand %xmm3, %xmm0
    319 ; SSE2-NEXT:    psrlw $4, %xmm0
    320 ; SSE2-NEXT:    pand %xmm1, %xmm0
    321 ; SSE2-NEXT:    por %xmm2, %xmm0
    322 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
    323 ; SSE2-NEXT:    pand %xmm0, %xmm1
    324 ; SSE2-NEXT:    psllw $2, %xmm1
    325 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
    326 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    327 ; SSE2-NEXT:    psrlw $2, %xmm0
    328 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    329 ; SSE2-NEXT:    por %xmm1, %xmm0
    330 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
    331 ; SSE2-NEXT:    pand %xmm0, %xmm1
    332 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    333 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    334 ; SSE2-NEXT:    psrlw $1, %xmm0
    335 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    336 ; SSE2-NEXT:    por %xmm1, %xmm0
    337 ; SSE2-NEXT:    retq
    338 ;
    339 ; SSSE3-LABEL: test_bitreverse_v8i16:
    340 ; SSSE3:       # %bb.0:
    341 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
    342 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    343 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
    344 ; SSSE3-NEXT:    pand %xmm1, %xmm2
    345 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    346 ; SSSE3-NEXT:    pshufb %xmm2, %xmm3
    347 ; SSSE3-NEXT:    psrlw $4, %xmm0
    348 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    349 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    350 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
    351 ; SSSE3-NEXT:    por %xmm3, %xmm1
    352 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
    353 ; SSSE3-NEXT:    retq
    354 ;
    355 ; AVX-LABEL: test_bitreverse_v8i16:
    356 ; AVX:       # %bb.0:
    357 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
    358 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    359 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
    360 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    361 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
    362 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
    363 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    364 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    365 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
    366 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
    367 ; AVX-NEXT:    retq
    368 ;
    369 ; XOP-LABEL: test_bitreverse_v8i16:
    370 ; XOP:       # %bb.0:
    371 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    372 ; XOP-NEXT:    retq
    373   %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
    374   ret <8 x i16> %b
    375 }
    376 
    377 define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
    378 ; SSE2-LABEL: test_bitreverse_v4i32:
    379 ; SSE2:       # %bb.0:
    380 ; SSE2-NEXT:    pxor %xmm1, %xmm1
    381 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    382 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
    383 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
    384 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
    385 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    386 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    387 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    388 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
    389 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    390 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    391 ; SSE2-NEXT:    pand %xmm1, %xmm2
    392 ; SSE2-NEXT:    psllw $4, %xmm2
    393 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
    394 ; SSE2-NEXT:    pand %xmm3, %xmm2
    395 ; SSE2-NEXT:    pand %xmm3, %xmm0
    396 ; SSE2-NEXT:    psrlw $4, %xmm0
    397 ; SSE2-NEXT:    pand %xmm1, %xmm0
    398 ; SSE2-NEXT:    por %xmm2, %xmm0
    399 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
    400 ; SSE2-NEXT:    pand %xmm0, %xmm1
    401 ; SSE2-NEXT:    psllw $2, %xmm1
    402 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
    403 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    404 ; SSE2-NEXT:    psrlw $2, %xmm0
    405 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    406 ; SSE2-NEXT:    por %xmm1, %xmm0
    407 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
    408 ; SSE2-NEXT:    pand %xmm0, %xmm1
    409 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    410 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    411 ; SSE2-NEXT:    psrlw $1, %xmm0
    412 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    413 ; SSE2-NEXT:    por %xmm1, %xmm0
    414 ; SSE2-NEXT:    retq
    415 ;
    416 ; SSSE3-LABEL: test_bitreverse_v4i32:
    417 ; SSSE3:       # %bb.0:
    418 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
    419 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    420 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
    421 ; SSSE3-NEXT:    pand %xmm1, %xmm2
    422 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    423 ; SSSE3-NEXT:    pshufb %xmm2, %xmm3
    424 ; SSSE3-NEXT:    psrlw $4, %xmm0
    425 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    426 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    427 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
    428 ; SSSE3-NEXT:    por %xmm3, %xmm1
    429 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
    430 ; SSSE3-NEXT:    retq
    431 ;
    432 ; AVX-LABEL: test_bitreverse_v4i32:
    433 ; AVX:       # %bb.0:
    434 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
    435 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    436 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
    437 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    438 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
    439 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
    440 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    441 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    442 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
    443 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
    444 ; AVX-NEXT:    retq
    445 ;
    446 ; XOP-LABEL: test_bitreverse_v4i32:
    447 ; XOP:       # %bb.0:
    448 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    449 ; XOP-NEXT:    retq
    450   %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
    451   ret <4 x i32> %b
    452 }
    453 
    454 define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
    455 ; SSE2-LABEL: test_bitreverse_v2i64:
    456 ; SSE2:       # %bb.0:
    457 ; SSE2-NEXT:    pxor %xmm1, %xmm1
    458 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    459 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
    460 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
    461 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
    462 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
    463 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    464 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    465 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    466 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    467 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
    468 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    469 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    470 ; SSE2-NEXT:    pand %xmm1, %xmm2
    471 ; SSE2-NEXT:    psllw $4, %xmm2
    472 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
    473 ; SSE2-NEXT:    pand %xmm3, %xmm2
    474 ; SSE2-NEXT:    pand %xmm3, %xmm0
    475 ; SSE2-NEXT:    psrlw $4, %xmm0
    476 ; SSE2-NEXT:    pand %xmm1, %xmm0
    477 ; SSE2-NEXT:    por %xmm2, %xmm0
    478 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
    479 ; SSE2-NEXT:    pand %xmm0, %xmm1
    480 ; SSE2-NEXT:    psllw $2, %xmm1
    481 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
    482 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    483 ; SSE2-NEXT:    psrlw $2, %xmm0
    484 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    485 ; SSE2-NEXT:    por %xmm1, %xmm0
    486 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
    487 ; SSE2-NEXT:    pand %xmm0, %xmm1
    488 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    489 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    490 ; SSE2-NEXT:    psrlw $1, %xmm0
    491 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
    492 ; SSE2-NEXT:    por %xmm1, %xmm0
    493 ; SSE2-NEXT:    retq
    494 ;
    495 ; SSSE3-LABEL: test_bitreverse_v2i64:
    496 ; SSSE3:       # %bb.0:
    497 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
    498 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    499 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
    500 ; SSSE3-NEXT:    pand %xmm1, %xmm2
    501 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    502 ; SSSE3-NEXT:    pshufb %xmm2, %xmm3
    503 ; SSSE3-NEXT:    psrlw $4, %xmm0
    504 ; SSSE3-NEXT:    pand %xmm1, %xmm0
    505 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    506 ; SSSE3-NEXT:    pshufb %xmm0, %xmm1
    507 ; SSSE3-NEXT:    por %xmm3, %xmm1
    508 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
    509 ; SSSE3-NEXT:    retq
    510 ;
    511 ; AVX-LABEL: test_bitreverse_v2i64:
    512 ; AVX:       # %bb.0:
    513 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
    514 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    515 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
    516 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    517 ; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
    518 ; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
    519 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
    520 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    521 ; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
    522 ; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
    523 ; AVX-NEXT:    retq
    524 ;
    525 ; XOP-LABEL: test_bitreverse_v2i64:
    526 ; XOP:       # %bb.0:
    527 ; XOP-NEXT:    vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
    528 ; XOP-NEXT:    retq
    529   %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
    530   ret <2 x i64> %b
    531 }
    532 
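; 256-bit vectors: SSE targets handle the two 128-bit halves independently in xmm0/xmm1;
; AVX1 and XOP extract the upper lane with vextractf128/vextracti128, reverse each half with
; the 128-bit sequences above, and recombine with vinsertf128/vinserti128; AVX2 and AVX-512
; apply the same per-element byte swap (where needed) and nibble lookup tables directly on
; YMM registers.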
    533 define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
    534 ; SSE2-LABEL: test_bitreverse_v32i8:
    535 ; SSE2:       # %bb.0:
    536 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    537 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    538 ; SSE2-NEXT:    pand %xmm2, %xmm3
    539 ; SSE2-NEXT:    psllw $4, %xmm3
    540 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
    541 ; SSE2-NEXT:    pand %xmm5, %xmm3
    542 ; SSE2-NEXT:    pand %xmm5, %xmm0
    543 ; SSE2-NEXT:    psrlw $4, %xmm0
    544 ; SSE2-NEXT:    pand %xmm2, %xmm0
    545 ; SSE2-NEXT:    por %xmm3, %xmm0
    546 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
    547 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
    548 ; SSE2-NEXT:    pand %xmm3, %xmm4
    549 ; SSE2-NEXT:    psllw $2, %xmm4
    550 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
    551 ; SSE2-NEXT:    pand %xmm8, %xmm4
    552 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
    553 ; SSE2-NEXT:    pand %xmm9, %xmm0
    554 ; SSE2-NEXT:    psrlw $2, %xmm0
    555 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
    556 ; SSE2-NEXT:    pand %xmm10, %xmm0
    557 ; SSE2-NEXT:    por %xmm4, %xmm0
    558 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
    559 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
    560 ; SSE2-NEXT:    pand %xmm4, %xmm7
    561 ; SSE2-NEXT:    psrlw $1, %xmm7
    562 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    563 ; SSE2-NEXT:    pand %xmm11, %xmm7
    564 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
    565 ; SSE2-NEXT:    pand %xmm6, %xmm0
    566 ; SSE2-NEXT:    paddb %xmm0, %xmm0
    567 ; SSE2-NEXT:    por %xmm7, %xmm0
    568 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
    569 ; SSE2-NEXT:    pand %xmm2, %xmm7
    570 ; SSE2-NEXT:    psllw $4, %xmm7
    571 ; SSE2-NEXT:    pand %xmm5, %xmm7
    572 ; SSE2-NEXT:    pand %xmm5, %xmm1
    573 ; SSE2-NEXT:    psrlw $4, %xmm1
    574 ; SSE2-NEXT:    pand %xmm2, %xmm1
    575 ; SSE2-NEXT:    por %xmm7, %xmm1
    576 ; SSE2-NEXT:    pand %xmm1, %xmm3
    577 ; SSE2-NEXT:    psllw $2, %xmm3
    578 ; SSE2-NEXT:    pand %xmm8, %xmm3
    579 ; SSE2-NEXT:    pand %xmm9, %xmm1
    580 ; SSE2-NEXT:    psrlw $2, %xmm1
    581 ; SSE2-NEXT:    pand %xmm10, %xmm1
    582 ; SSE2-NEXT:    por %xmm3, %xmm1
    583 ; SSE2-NEXT:    pand %xmm1, %xmm4
    584 ; SSE2-NEXT:    psrlw $1, %xmm4
    585 ; SSE2-NEXT:    pand %xmm11, %xmm4
    586 ; SSE2-NEXT:    pand %xmm6, %xmm1
    587 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    588 ; SSE2-NEXT:    por %xmm4, %xmm1
    589 ; SSE2-NEXT:    retq
    590 ;
    591 ; SSSE3-LABEL: test_bitreverse_v32i8:
    592 ; SSSE3:       # %bb.0:
    593 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    594 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
    595 ; SSSE3-NEXT:    pand %xmm4, %xmm2
    596 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    597 ; SSSE3-NEXT:    movdqa %xmm5, %xmm6
    598 ; SSSE3-NEXT:    pshufb %xmm2, %xmm6
    599 ; SSSE3-NEXT:    psrlw $4, %xmm0
    600 ; SSSE3-NEXT:    pand %xmm4, %xmm0
    601 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    602 ; SSSE3-NEXT:    movdqa %xmm2, %xmm3
    603 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
    604 ; SSSE3-NEXT:    por %xmm6, %xmm3
    605 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
    606 ; SSSE3-NEXT:    pand %xmm4, %xmm0
    607 ; SSSE3-NEXT:    pshufb %xmm0, %xmm5
    608 ; SSSE3-NEXT:    psrlw $4, %xmm1
    609 ; SSSE3-NEXT:    pand %xmm4, %xmm1
    610 ; SSSE3-NEXT:    pshufb %xmm1, %xmm2
    611 ; SSSE3-NEXT:    por %xmm5, %xmm2
    612 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
    613 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
    614 ; SSSE3-NEXT:    retq
    615 ;
    616 ; AVX1-LABEL: test_bitreverse_v32i8:
    617 ; AVX1:       # %bb.0:
    618 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    619 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    620 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
    621 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    622 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
    623 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
    624 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
    625 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    626 ; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
    627 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
    628 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
    629 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
    630 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
    631 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
    632 ; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
    633 ; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
    634 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    635 ; AVX1-NEXT:    retq
    636 ;
    637 ; AVX2-LABEL: test_bitreverse_v32i8:
    638 ; AVX2:       # %bb.0:
    639 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    640 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
    641 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    642 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    643 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
    644 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
    645 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    646 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
    647 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
    648 ; AVX2-NEXT:    retq
    649 ;
    650 ; AVX512-LABEL: test_bitreverse_v32i8:
    651 ; AVX512:       # %bb.0:
    652 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    653 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
    654 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    655 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    656 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
    657 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
    658 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    659 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
    660 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
    661 ; AVX512-NEXT:    retq
    662 ;
    663 ; XOPAVX1-LABEL: test_bitreverse_v32i8:
    664 ; XOPAVX1:       # %bb.0:
    665 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    666 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
    667 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
    668 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
    669 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    670 ; XOPAVX1-NEXT:    retq
    671 ;
    672 ; XOPAVX2-LABEL: test_bitreverse_v32i8:
    673 ; XOPAVX2:       # %bb.0:
    674 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    675 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
    676 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
    677 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
    678 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
    679 ; XOPAVX2-NEXT:    retq
    680   %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
    681   ret <32 x i8> %b
    682 }
    683 
    684 define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
    685 ; SSE2-LABEL: test_bitreverse_v16i16:
    686 ; SSE2:       # %bb.0:
    687 ; SSE2-NEXT:    pxor %xmm4, %xmm4
    688 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    689 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
    690 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
    691 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
    692 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
    693 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
    694 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
    695 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
    696 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    697 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    698 ; SSE2-NEXT:    pand %xmm2, %xmm3
    699 ; SSE2-NEXT:    psllw $4, %xmm3
    700 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
    701 ; SSE2-NEXT:    pand %xmm6, %xmm3
    702 ; SSE2-NEXT:    pand %xmm6, %xmm0
    703 ; SSE2-NEXT:    psrlw $4, %xmm0
    704 ; SSE2-NEXT:    pand %xmm2, %xmm0
    705 ; SSE2-NEXT:    por %xmm3, %xmm0
    706 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
    707 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
    708 ; SSE2-NEXT:    pand %xmm3, %xmm5
    709 ; SSE2-NEXT:    psllw $2, %xmm5
    710 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
    711 ; SSE2-NEXT:    pand %xmm8, %xmm5
    712 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
    713 ; SSE2-NEXT:    pand %xmm9, %xmm0
    714 ; SSE2-NEXT:    psrlw $2, %xmm0
    715 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
    716 ; SSE2-NEXT:    pand %xmm10, %xmm0
    717 ; SSE2-NEXT:    por %xmm5, %xmm0
    718 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
    719 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
    720 ; SSE2-NEXT:    pand %xmm5, %xmm7
    721 ; SSE2-NEXT:    psrlw $1, %xmm7
    722 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    723 ; SSE2-NEXT:    pand %xmm11, %xmm7
    724 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
    725 ; SSE2-NEXT:    pand %xmm12, %xmm0
    726 ; SSE2-NEXT:    paddb %xmm0, %xmm0
    727 ; SSE2-NEXT:    por %xmm7, %xmm0
    728 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
    729 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
    730 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7]
    731 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
    732 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
    733 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
    734 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
    735 ; SSE2-NEXT:    packuswb %xmm7, %xmm1
    736 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
    737 ; SSE2-NEXT:    pand %xmm2, %xmm4
    738 ; SSE2-NEXT:    psllw $4, %xmm4
    739 ; SSE2-NEXT:    pand %xmm6, %xmm4
    740 ; SSE2-NEXT:    pand %xmm6, %xmm1
    741 ; SSE2-NEXT:    psrlw $4, %xmm1
    742 ; SSE2-NEXT:    pand %xmm2, %xmm1
    743 ; SSE2-NEXT:    por %xmm4, %xmm1
    744 ; SSE2-NEXT:    pand %xmm1, %xmm3
    745 ; SSE2-NEXT:    psllw $2, %xmm3
    746 ; SSE2-NEXT:    pand %xmm8, %xmm3
    747 ; SSE2-NEXT:    pand %xmm9, %xmm1
    748 ; SSE2-NEXT:    psrlw $2, %xmm1
    749 ; SSE2-NEXT:    pand %xmm10, %xmm1
    750 ; SSE2-NEXT:    por %xmm3, %xmm1
    751 ; SSE2-NEXT:    pand %xmm1, %xmm5
    752 ; SSE2-NEXT:    psrlw $1, %xmm5
    753 ; SSE2-NEXT:    pand %xmm11, %xmm5
    754 ; SSE2-NEXT:    pand %xmm12, %xmm1
    755 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    756 ; SSE2-NEXT:    por %xmm5, %xmm1
    757 ; SSE2-NEXT:    retq
    758 ;
    759 ; SSSE3-LABEL: test_bitreverse_v16i16:
    760 ; SSSE3:       # %bb.0:
    761 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
    762 ; SSSE3-NEXT:    pshufb %xmm4, %xmm0
    763 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    764 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
    765 ; SSSE3-NEXT:    pand %xmm5, %xmm2
    766 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    767 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
    768 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
    769 ; SSSE3-NEXT:    psrlw $4, %xmm0
    770 ; SSSE3-NEXT:    pand %xmm5, %xmm0
    771 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    772 ; SSSE3-NEXT:    movdqa %xmm2, %xmm3
    773 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
    774 ; SSSE3-NEXT:    por %xmm7, %xmm3
    775 ; SSSE3-NEXT:    pshufb %xmm4, %xmm1
    776 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
    777 ; SSSE3-NEXT:    pand %xmm5, %xmm0
    778 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
    779 ; SSSE3-NEXT:    psrlw $4, %xmm1
    780 ; SSSE3-NEXT:    pand %xmm5, %xmm1
    781 ; SSSE3-NEXT:    pshufb %xmm1, %xmm2
    782 ; SSSE3-NEXT:    por %xmm6, %xmm2
    783 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
    784 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
    785 ; SSSE3-NEXT:    retq
    786 ;
    787 ; AVX1-LABEL: test_bitreverse_v16i16:
    788 ; AVX1:       # %bb.0:
    789 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    790 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
    791 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    792 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    793 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
    794 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    795 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
    796 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
    797 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
    798 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    799 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
    800 ; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
    801 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    802 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
    803 ; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
    804 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
    805 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
    806 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
    807 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
    808 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    809 ; AVX1-NEXT:    retq
    810 ;
    811 ; AVX2-LABEL: test_bitreverse_v16i16:
    812 ; AVX2:       # %bb.0:
    813 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
    814 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    815 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
    816 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    817 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    818 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
    819 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
    820 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    821 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
    822 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
    823 ; AVX2-NEXT:    retq
    824 ;
    825 ; AVX512-LABEL: test_bitreverse_v16i16:
    826 ; AVX512:       # %bb.0:
    827 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
    828 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    829 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
    830 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    831 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    832 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
    833 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
    834 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    835 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
    836 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
    837 ; AVX512-NEXT:    retq
    838 ;
    839 ; XOPAVX1-LABEL: test_bitreverse_v16i16:
    840 ; XOPAVX1:       # %bb.0:
    841 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    842 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
    843 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
    844 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
    845 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    846 ; XOPAVX1-NEXT:    retq
    847 ;
    848 ; XOPAVX2-LABEL: test_bitreverse_v16i16:
    849 ; XOPAVX2:       # %bb.0:
    850 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    851 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
    852 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
    853 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
    854 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
    855 ; XOPAVX2-NEXT:    retq
    856   %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
    857   ret <16 x i16> %b
    858 }
    859 
    860 define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
    861 ; SSE2-LABEL: test_bitreverse_v8i32:
    862 ; SSE2:       # %bb.0:
    863 ; SSE2-NEXT:    pxor %xmm4, %xmm4
    864 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
    865 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
    866 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
    867 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
    868 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
    869 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
    870 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
    871 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
    872 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    873 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
    874 ; SSE2-NEXT:    pand %xmm2, %xmm3
    875 ; SSE2-NEXT:    psllw $4, %xmm3
    876 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
    877 ; SSE2-NEXT:    pand %xmm6, %xmm3
    878 ; SSE2-NEXT:    pand %xmm6, %xmm0
    879 ; SSE2-NEXT:    psrlw $4, %xmm0
    880 ; SSE2-NEXT:    pand %xmm2, %xmm0
    881 ; SSE2-NEXT:    por %xmm3, %xmm0
    882 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
    883 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
    884 ; SSE2-NEXT:    pand %xmm3, %xmm5
    885 ; SSE2-NEXT:    psllw $2, %xmm5
    886 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
    887 ; SSE2-NEXT:    pand %xmm8, %xmm5
    888 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
    889 ; SSE2-NEXT:    pand %xmm9, %xmm0
    890 ; SSE2-NEXT:    psrlw $2, %xmm0
    891 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
    892 ; SSE2-NEXT:    pand %xmm10, %xmm0
    893 ; SSE2-NEXT:    por %xmm5, %xmm0
    894 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
    895 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
    896 ; SSE2-NEXT:    pand %xmm5, %xmm7
    897 ; SSE2-NEXT:    psrlw $1, %xmm7
    898 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
    899 ; SSE2-NEXT:    pand %xmm11, %xmm7
    900 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
    901 ; SSE2-NEXT:    pand %xmm12, %xmm0
    902 ; SSE2-NEXT:    paddb %xmm0, %xmm0
    903 ; SSE2-NEXT:    por %xmm7, %xmm0
    904 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
    905 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
    906 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
    907 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
    908 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
    909 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
    910 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
    911 ; SSE2-NEXT:    packuswb %xmm7, %xmm1
    912 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
    913 ; SSE2-NEXT:    pand %xmm2, %xmm4
    914 ; SSE2-NEXT:    psllw $4, %xmm4
    915 ; SSE2-NEXT:    pand %xmm6, %xmm4
    916 ; SSE2-NEXT:    pand %xmm6, %xmm1
    917 ; SSE2-NEXT:    psrlw $4, %xmm1
    918 ; SSE2-NEXT:    pand %xmm2, %xmm1
    919 ; SSE2-NEXT:    por %xmm4, %xmm1
    920 ; SSE2-NEXT:    pand %xmm1, %xmm3
    921 ; SSE2-NEXT:    psllw $2, %xmm3
    922 ; SSE2-NEXT:    pand %xmm8, %xmm3
    923 ; SSE2-NEXT:    pand %xmm9, %xmm1
    924 ; SSE2-NEXT:    psrlw $2, %xmm1
    925 ; SSE2-NEXT:    pand %xmm10, %xmm1
    926 ; SSE2-NEXT:    por %xmm3, %xmm1
    927 ; SSE2-NEXT:    pand %xmm1, %xmm5
    928 ; SSE2-NEXT:    psrlw $1, %xmm5
    929 ; SSE2-NEXT:    pand %xmm11, %xmm5
    930 ; SSE2-NEXT:    pand %xmm12, %xmm1
    931 ; SSE2-NEXT:    paddb %xmm1, %xmm1
    932 ; SSE2-NEXT:    por %xmm5, %xmm1
    933 ; SSE2-NEXT:    retq
    934 ;
    935 ; SSSE3-LABEL: test_bitreverse_v8i32:
    936 ; SSSE3:       # %bb.0:
    937 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
    938 ; SSSE3-NEXT:    pshufb %xmm4, %xmm0
    939 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    940 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
    941 ; SSSE3-NEXT:    pand %xmm5, %xmm2
    942 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    943 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
    944 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
    945 ; SSSE3-NEXT:    psrlw $4, %xmm0
    946 ; SSSE3-NEXT:    pand %xmm5, %xmm0
    947 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    948 ; SSSE3-NEXT:    movdqa %xmm2, %xmm3
    949 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
    950 ; SSSE3-NEXT:    por %xmm7, %xmm3
    951 ; SSSE3-NEXT:    pshufb %xmm4, %xmm1
    952 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
    953 ; SSSE3-NEXT:    pand %xmm5, %xmm0
    954 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
    955 ; SSSE3-NEXT:    psrlw $4, %xmm1
    956 ; SSSE3-NEXT:    pand %xmm5, %xmm1
    957 ; SSSE3-NEXT:    pshufb %xmm1, %xmm2
    958 ; SSSE3-NEXT:    por %xmm6, %xmm2
    959 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
    960 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
    961 ; SSSE3-NEXT:    retq
    962 ;
    963 ; AVX1-LABEL: test_bitreverse_v8i32:
    964 ; AVX1:       # %bb.0:
    965 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
    966 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
    967 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
    968 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    969 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
    970 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    971 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
    972 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
    973 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
    974 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    975 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
    976 ; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
    977 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
    978 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
    979 ; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
    980 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
    981 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
    982 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
    983 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
    984 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    985 ; AVX1-NEXT:    retq
    986 ;
    987 ; AVX2-LABEL: test_bitreverse_v8i32:
    988 ; AVX2:       # %bb.0:
    989 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
    990 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
    991 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
    992 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
    993 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
    994 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
    995 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
    996 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
    997 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
    998 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
    999 ; AVX2-NEXT:    retq
   1000 ;
   1001 ; AVX512-LABEL: test_bitreverse_v8i32:
   1002 ; AVX512:       # %bb.0:
   1003 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
   1004 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1005 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1006 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1007 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1008 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1009 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1010 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1011 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1012 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1013 ; AVX512-NEXT:    retq
   1014 ;
   1015 ; XOPAVX1-LABEL: test_bitreverse_v8i32:
   1016 ; XOPAVX1:       # %bb.0:
   1017 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1018 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
   1019 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1020 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1021 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1022 ; XOPAVX1-NEXT:    retq
   1023 ;
   1024 ; XOPAVX2-LABEL: test_bitreverse_v8i32:
   1025 ; XOPAVX2:       # %bb.0:
   1026 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1027 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
   1028 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1029 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1030 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1031 ; XOPAVX2-NEXT:    retq
   1032   %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
   1033   ret <8 x i32> %b
   1034 }
   1035 
   1036 define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
   1037 ; SSE2-LABEL: test_bitreverse_v4i64:
   1038 ; SSE2:       # %bb.0:
   1039 ; SSE2-NEXT:    pxor %xmm4, %xmm4
   1040 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
   1041 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
   1042 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   1043 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
   1044 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
   1045 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
   1046 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   1047 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   1048 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
   1049 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
   1050 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1051 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1052 ; SSE2-NEXT:    pand %xmm2, %xmm3
   1053 ; SSE2-NEXT:    psllw $4, %xmm3
   1054 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
   1055 ; SSE2-NEXT:    pand %xmm6, %xmm3
   1056 ; SSE2-NEXT:    pand %xmm6, %xmm0
   1057 ; SSE2-NEXT:    psrlw $4, %xmm0
   1058 ; SSE2-NEXT:    pand %xmm2, %xmm0
   1059 ; SSE2-NEXT:    por %xmm3, %xmm0
   1060 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
   1061 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   1062 ; SSE2-NEXT:    pand %xmm3, %xmm5
   1063 ; SSE2-NEXT:    psllw $2, %xmm5
   1064 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
   1065 ; SSE2-NEXT:    pand %xmm8, %xmm5
   1066 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
   1067 ; SSE2-NEXT:    pand %xmm9, %xmm0
   1068 ; SSE2-NEXT:    psrlw $2, %xmm0
   1069 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
   1070 ; SSE2-NEXT:    pand %xmm10, %xmm0
   1071 ; SSE2-NEXT:    por %xmm5, %xmm0
   1072 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
   1073 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   1074 ; SSE2-NEXT:    pand %xmm5, %xmm7
   1075 ; SSE2-NEXT:    psrlw $1, %xmm7
   1076 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1077 ; SSE2-NEXT:    pand %xmm11, %xmm7
   1078 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
   1079 ; SSE2-NEXT:    pand %xmm12, %xmm0
   1080 ; SSE2-NEXT:    paddb %xmm0, %xmm0
   1081 ; SSE2-NEXT:    por %xmm7, %xmm0
   1082 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
   1083 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
   1084 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
   1085 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
   1086 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
   1087 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
   1088 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   1089 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
   1090 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
   1091 ; SSE2-NEXT:    packuswb %xmm7, %xmm1
   1092 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1093 ; SSE2-NEXT:    pand %xmm2, %xmm4
   1094 ; SSE2-NEXT:    psllw $4, %xmm4
   1095 ; SSE2-NEXT:    pand %xmm6, %xmm4
   1096 ; SSE2-NEXT:    pand %xmm6, %xmm1
   1097 ; SSE2-NEXT:    psrlw $4, %xmm1
   1098 ; SSE2-NEXT:    pand %xmm2, %xmm1
   1099 ; SSE2-NEXT:    por %xmm4, %xmm1
   1100 ; SSE2-NEXT:    pand %xmm1, %xmm3
   1101 ; SSE2-NEXT:    psllw $2, %xmm3
   1102 ; SSE2-NEXT:    pand %xmm8, %xmm3
   1103 ; SSE2-NEXT:    pand %xmm9, %xmm1
   1104 ; SSE2-NEXT:    psrlw $2, %xmm1
   1105 ; SSE2-NEXT:    pand %xmm10, %xmm1
   1106 ; SSE2-NEXT:    por %xmm3, %xmm1
   1107 ; SSE2-NEXT:    pand %xmm1, %xmm5
   1108 ; SSE2-NEXT:    psrlw $1, %xmm5
   1109 ; SSE2-NEXT:    pand %xmm11, %xmm5
   1110 ; SSE2-NEXT:    pand %xmm12, %xmm1
   1111 ; SSE2-NEXT:    paddb %xmm1, %xmm1
   1112 ; SSE2-NEXT:    por %xmm5, %xmm1
   1113 ; SSE2-NEXT:    retq
   1114 ;
   1115 ; SSSE3-LABEL: test_bitreverse_v4i64:
   1116 ; SSSE3:       # %bb.0:
   1117 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   1118 ; SSSE3-NEXT:    pshufb %xmm4, %xmm0
   1119 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1120 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
   1121 ; SSSE3-NEXT:    pand %xmm5, %xmm2
   1122 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1123 ; SSSE3-NEXT:    movdqa %xmm6, %xmm7
   1124 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   1125 ; SSSE3-NEXT:    psrlw $4, %xmm0
   1126 ; SSSE3-NEXT:    pand %xmm5, %xmm0
   1127 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1128 ; SSSE3-NEXT:    movdqa %xmm2, %xmm3
   1129 ; SSSE3-NEXT:    pshufb %xmm0, %xmm3
   1130 ; SSSE3-NEXT:    por %xmm7, %xmm3
   1131 ; SSSE3-NEXT:    pshufb %xmm4, %xmm1
   1132 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1133 ; SSSE3-NEXT:    pand %xmm5, %xmm0
   1134 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   1135 ; SSSE3-NEXT:    psrlw $4, %xmm1
   1136 ; SSSE3-NEXT:    pand %xmm5, %xmm1
   1137 ; SSSE3-NEXT:    pshufb %xmm1, %xmm2
   1138 ; SSSE3-NEXT:    por %xmm6, %xmm2
   1139 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
   1140 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
   1141 ; SSSE3-NEXT:    retq
   1142 ;
   1143 ; AVX1-LABEL: test_bitreverse_v4i64:
   1144 ; AVX1:       # %bb.0:
   1145 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1146 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   1147 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1148 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1149 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
   1150 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1151 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   1152 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   1153 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   1154 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1155 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
   1156 ; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
   1157 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1158 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
   1159 ; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
   1160 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1161 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   1162 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
   1163 ; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
   1164 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1165 ; AVX1-NEXT:    retq
   1166 ;
   1167 ; AVX2-LABEL: test_bitreverse_v4i64:
   1168 ; AVX2:       # %bb.0:
   1169 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
   1170 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1171 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1172 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1173 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1174 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1175 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1176 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1177 ; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1178 ; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1179 ; AVX2-NEXT:    retq
   1180 ;
   1181 ; AVX512-LABEL: test_bitreverse_v4i64:
   1182 ; AVX512:       # %bb.0:
   1183 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
   1184 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1185 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
   1186 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1187 ; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
   1188 ; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1189 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
   1190 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1191 ; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
   1192 ; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
   1193 ; AVX512-NEXT:    retq
   1194 ;
   1195 ; XOPAVX1-LABEL: test_bitreverse_v4i64:
   1196 ; XOPAVX1:       # %bb.0:
   1197 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
   1198 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
   1199 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1200 ; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1201 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1202 ; XOPAVX1-NEXT:    retq
   1203 ;
   1204 ; XOPAVX2-LABEL: test_bitreverse_v4i64:
   1205 ; XOPAVX2:       # %bb.0:
   1206 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1207 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
   1208 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
   1209 ; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
   1210 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   1211 ; XOPAVX2-NEXT:    retq
   1212   %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
   1213   ret <4 x i64> %b
   1214 }
   1215 
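; v64i8 below: the element is a single byte, so no byte swap is needed and each byte is
; bit-reversed directly. SSE2 uses three shift/mask/or stages; SSSE3 and AVX use the two
; 16-entry pshufb lookup tables; only AVX512BW keeps the full 512-bit vector in one
; register, since vpshufb on zmm requires AVX512BW and plain AVX512F falls back to two
; 256-bit halves. As a worked example of the table lookup: for the byte 0x2d the low
; nibble 0xd indexes [0,128,64,...,240] to give 0xb0, the high nibble 0x2 indexes
; [0,8,4,...,15] to give 0x04, and or'ing them yields 0xb4 = bitreverse(0x2d).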
   1216 define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
   1217 ; SSE2-LABEL: test_bitreverse_v64i8:
   1218 ; SSE2:       # %bb.0:
   1219 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1220 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   1221 ; SSE2-NEXT:    pand %xmm13, %xmm5
   1222 ; SSE2-NEXT:    psllw $4, %xmm5
   1223 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
   1224 ; SSE2-NEXT:    pand %xmm7, %xmm5
   1225 ; SSE2-NEXT:    pand %xmm7, %xmm0
   1226 ; SSE2-NEXT:    psrlw $4, %xmm0
   1227 ; SSE2-NEXT:    pand %xmm13, %xmm0
   1228 ; SSE2-NEXT:    por %xmm5, %xmm0
   1229 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
   1230 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   1231 ; SSE2-NEXT:    pand %xmm5, %xmm6
   1232 ; SSE2-NEXT:    psllw $2, %xmm6
   1233 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
   1234 ; SSE2-NEXT:    pand %xmm8, %xmm6
   1235 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
   1236 ; SSE2-NEXT:    pand %xmm9, %xmm0
   1237 ; SSE2-NEXT:    psrlw $2, %xmm0
   1238 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
   1239 ; SSE2-NEXT:    pand %xmm10, %xmm0
   1240 ; SSE2-NEXT:    por %xmm6, %xmm0
   1241 ; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
   1242 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1243 ; SSE2-NEXT:    pand %xmm6, %xmm4
   1244 ; SSE2-NEXT:    psrlw $1, %xmm4
   1245 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1246 ; SSE2-NEXT:    pand %xmm11, %xmm4
   1247 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
   1248 ; SSE2-NEXT:    pand %xmm12, %xmm0
   1249 ; SSE2-NEXT:    paddb %xmm0, %xmm0
   1250 ; SSE2-NEXT:    por %xmm4, %xmm0
   1251 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1252 ; SSE2-NEXT:    pand %xmm13, %xmm4
   1253 ; SSE2-NEXT:    psllw $4, %xmm4
   1254 ; SSE2-NEXT:    pand %xmm7, %xmm4
   1255 ; SSE2-NEXT:    pand %xmm7, %xmm1
   1256 ; SSE2-NEXT:    psrlw $4, %xmm1
   1257 ; SSE2-NEXT:    pand %xmm13, %xmm1
   1258 ; SSE2-NEXT:    por %xmm4, %xmm1
   1259 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1260 ; SSE2-NEXT:    pand %xmm5, %xmm4
   1261 ; SSE2-NEXT:    psllw $2, %xmm4
   1262 ; SSE2-NEXT:    pand %xmm8, %xmm4
   1263 ; SSE2-NEXT:    pand %xmm9, %xmm1
   1264 ; SSE2-NEXT:    psrlw $2, %xmm1
   1265 ; SSE2-NEXT:    pand %xmm10, %xmm1
   1266 ; SSE2-NEXT:    por %xmm4, %xmm1
   1267 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   1268 ; SSE2-NEXT:    pand %xmm6, %xmm4
   1269 ; SSE2-NEXT:    psrlw $1, %xmm4
   1270 ; SSE2-NEXT:    pand %xmm11, %xmm4
   1271 ; SSE2-NEXT:    pand %xmm12, %xmm1
   1272 ; SSE2-NEXT:    paddb %xmm1, %xmm1
   1273 ; SSE2-NEXT:    por %xmm4, %xmm1
   1274 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   1275 ; SSE2-NEXT:    pand %xmm13, %xmm4
   1276 ; SSE2-NEXT:    psllw $4, %xmm4
   1277 ; SSE2-NEXT:    pand %xmm7, %xmm4
   1278 ; SSE2-NEXT:    pand %xmm7, %xmm2
   1279 ; SSE2-NEXT:    psrlw $4, %xmm2
   1280 ; SSE2-NEXT:    pand %xmm13, %xmm2
   1281 ; SSE2-NEXT:    por %xmm4, %xmm2
   1282 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   1283 ; SSE2-NEXT:    pand %xmm5, %xmm4
   1284 ; SSE2-NEXT:    psllw $2, %xmm4
   1285 ; SSE2-NEXT:    pand %xmm8, %xmm4
   1286 ; SSE2-NEXT:    pand %xmm9, %xmm2
   1287 ; SSE2-NEXT:    psrlw $2, %xmm2
   1288 ; SSE2-NEXT:    pand %xmm10, %xmm2
   1289 ; SSE2-NEXT:    por %xmm4, %xmm2
   1290 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   1291 ; SSE2-NEXT:    pand %xmm6, %xmm4
   1292 ; SSE2-NEXT:    psrlw $1, %xmm4
   1293 ; SSE2-NEXT:    pand %xmm11, %xmm4
   1294 ; SSE2-NEXT:    pand %xmm12, %xmm2
   1295 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1296 ; SSE2-NEXT:    por %xmm4, %xmm2
   1297 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
   1298 ; SSE2-NEXT:    pand %xmm13, %xmm4
   1299 ; SSE2-NEXT:    psllw $4, %xmm4
   1300 ; SSE2-NEXT:    pand %xmm7, %xmm4
   1301 ; SSE2-NEXT:    pand %xmm7, %xmm3
   1302 ; SSE2-NEXT:    psrlw $4, %xmm3
   1303 ; SSE2-NEXT:    pand %xmm13, %xmm3
   1304 ; SSE2-NEXT:    por %xmm4, %xmm3
   1305 ; SSE2-NEXT:    pand %xmm3, %xmm5
   1306 ; SSE2-NEXT:    psllw $2, %xmm5
   1307 ; SSE2-NEXT:    pand %xmm8, %xmm5
   1308 ; SSE2-NEXT:    pand %xmm9, %xmm3
   1309 ; SSE2-NEXT:    psrlw $2, %xmm3
   1310 ; SSE2-NEXT:    pand %xmm10, %xmm3
   1311 ; SSE2-NEXT:    por %xmm5, %xmm3
   1312 ; SSE2-NEXT:    pand %xmm3, %xmm6
   1313 ; SSE2-NEXT:    psrlw $1, %xmm6
   1314 ; SSE2-NEXT:    pand %xmm11, %xmm6
   1315 ; SSE2-NEXT:    pand %xmm12, %xmm3
   1316 ; SSE2-NEXT:    paddb %xmm3, %xmm3
   1317 ; SSE2-NEXT:    por %xmm6, %xmm3
   1318 ; SSE2-NEXT:    retq
   1319 ;
   1320 ; SSSE3-LABEL: test_bitreverse_v64i8:
   1321 ; SSSE3:       # %bb.0:
   1322 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
   1323 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1324 ; SSSE3-NEXT:    pand %xmm8, %xmm0
   1325 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1326 ; SSSE3-NEXT:    movdqa %xmm9, %xmm6
   1327 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   1328 ; SSSE3-NEXT:    psrlw $4, %xmm5
   1329 ; SSSE3-NEXT:    pand %xmm8, %xmm5
   1330 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1331 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
   1332 ; SSSE3-NEXT:    pshufb %xmm5, %xmm0
   1333 ; SSSE3-NEXT:    por %xmm6, %xmm0
   1334 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
   1335 ; SSSE3-NEXT:    pand %xmm8, %xmm5
   1336 ; SSSE3-NEXT:    movdqa %xmm9, %xmm6
   1337 ; SSSE3-NEXT:    pshufb %xmm5, %xmm6
   1338 ; SSSE3-NEXT:    psrlw $4, %xmm1
   1339 ; SSSE3-NEXT:    pand %xmm8, %xmm1
   1340 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
   1341 ; SSSE3-NEXT:    pshufb %xmm1, %xmm5
   1342 ; SSSE3-NEXT:    por %xmm6, %xmm5
   1343 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
   1344 ; SSSE3-NEXT:    pand %xmm8, %xmm1
   1345 ; SSSE3-NEXT:    movdqa %xmm9, %xmm7
   1346 ; SSSE3-NEXT:    pshufb %xmm1, %xmm7
   1347 ; SSSE3-NEXT:    psrlw $4, %xmm2
   1348 ; SSSE3-NEXT:    pand %xmm8, %xmm2
   1349 ; SSSE3-NEXT:    movdqa %xmm4, %xmm6
   1350 ; SSSE3-NEXT:    pshufb %xmm2, %xmm6
   1351 ; SSSE3-NEXT:    por %xmm7, %xmm6
   1352 ; SSSE3-NEXT:    movdqa %xmm3, %xmm1
   1353 ; SSSE3-NEXT:    pand %xmm8, %xmm1
   1354 ; SSSE3-NEXT:    pshufb %xmm1, %xmm9
   1355 ; SSSE3-NEXT:    psrlw $4, %xmm3
   1356 ; SSSE3-NEXT:    pand %xmm8, %xmm3
   1357 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
   1358 ; SSSE3-NEXT:    por %xmm9, %xmm4
   1359 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
   1360 ; SSSE3-NEXT:    movdqa %xmm6, %xmm2
   1361 ; SSSE3-NEXT:    movdqa %xmm4, %xmm3
   1362 ; SSSE3-NEXT:    retq
   1363 ;
   1364 ; AVX1-LABEL: test_bitreverse_v64i8:
   1365 ; AVX1:       # %bb.0:
   1366 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1367 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1368 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
   1369 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1370 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   1371 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   1372 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   1373 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1374 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
   1375 ; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
   1376 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
   1377 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   1378 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1379 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
   1380 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
   1381 ; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
   1382 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1383 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1384 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
   1385 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   1386 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   1387 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
   1388 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
   1389 ; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
   1390 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
   1391 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
   1392 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   1393 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
   1394 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
   1395 ; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
   1396 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1397 ; AVX1-NEXT:    retq
   1398 ;
   1399 ; AVX2-LABEL: test_bitreverse_v64i8:
   1400 ; AVX2:       # %bb.0:
   1401 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1402 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
   1403 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1404 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
   1405 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1406 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
   1407 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1408 ; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
   1409 ; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
   1410 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
   1411 ; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
   1412 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
   1413 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
   1414 ; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
   1415 ; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
   1416 ; AVX2-NEXT:    retq
   1417 ;
   1418 ; AVX512F-LABEL: test_bitreverse_v64i8:
   1419 ; AVX512F:       # %bb.0:
   1420 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1421 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm3
   1422 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1423 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
   1424 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1425 ; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
   1426 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1427 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
   1428 ; AVX512F-NEXT:    vpor %ymm0, %ymm3, %ymm0
   1429 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
   1430 ; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
   1431 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
   1432 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
   1433 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
   1434 ; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
   1435 ; AVX512F-NEXT:    retq
   1436 ;
   1437 ; AVX512BW-LABEL: test_bitreverse_v64i8:
   1438 ; AVX512BW:       # %bb.0:
   1439 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1440 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
   1441 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1442 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
   1443 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
   1444 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   1445 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1446 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
   1447 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
   1448 ; AVX512BW-NEXT:    retq
   1449 ;
   1450 ; XOPAVX1-LABEL: test_bitreverse_v64i8:
   1451 ; XOPAVX1:       # %bb.0:
   1452 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1453 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
   1454 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   1455 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   1456 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1457 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1458 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   1459 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   1460 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1461 ; XOPAVX1-NEXT:    retq
   1462 ;
   1463 ; XOPAVX2-LABEL: test_bitreverse_v64i8:
   1464 ; XOPAVX2:       # %bb.0:
   1465 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   1466 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
   1467 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   1468 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   1469 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   1470 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   1471 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   1472 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   1473 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
   1474 ; XOPAVX2-NEXT:    retq
   1475   %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
   1476   ret <64 x i8> %b
   1477 }
   1478 
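; v32i16 below: same per-byte bit reversal, preceded by a byte swap within each 16-bit
; element (SSE2 via pshuflw/pshufhw on the zero-extended halves, SSSE3/AVX via an
; in-lane byte shuffle, AVX512BW via a single zmm vpshufb).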
   1479 define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
   1480 ; SSE2-LABEL: test_bitreverse_v32i16:
   1481 ; SSE2:       # %bb.0:
   1482 ; SSE2-NEXT:    pxor %xmm14, %xmm14
   1483 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1484 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
   1485 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
   1486 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
   1487 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
   1488 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
   1489 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
   1490 ; SSE2-NEXT:    packuswb %xmm4, %xmm0
   1491 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1492 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   1493 ; SSE2-NEXT:    pand %xmm8, %xmm5
   1494 ; SSE2-NEXT:    psllw $4, %xmm5
   1495 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
   1496 ; SSE2-NEXT:    pand %xmm4, %xmm5
   1497 ; SSE2-NEXT:    pand %xmm4, %xmm0
   1498 ; SSE2-NEXT:    psrlw $4, %xmm0
   1499 ; SSE2-NEXT:    pand %xmm8, %xmm0
   1500 ; SSE2-NEXT:    por %xmm5, %xmm0
   1501 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
   1502 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   1503 ; SSE2-NEXT:    pand %xmm5, %xmm7
   1504 ; SSE2-NEXT:    psllw $2, %xmm7
   1505 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
   1506 ; SSE2-NEXT:    pand %xmm9, %xmm7
   1507 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
   1508 ; SSE2-NEXT:    pand %xmm10, %xmm0
   1509 ; SSE2-NEXT:    psrlw $2, %xmm0
   1510 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
   1511 ; SSE2-NEXT:    pand %xmm11, %xmm0
   1512 ; SSE2-NEXT:    por %xmm7, %xmm0
   1513 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
   1514 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   1515 ; SSE2-NEXT:    pand %xmm7, %xmm6
   1516 ; SSE2-NEXT:    psrlw $1, %xmm6
   1517 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1518 ; SSE2-NEXT:    pand %xmm12, %xmm6
   1519 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
   1520 ; SSE2-NEXT:    pand %xmm13, %xmm0
   1521 ; SSE2-NEXT:    paddb %xmm0, %xmm0
   1522 ; SSE2-NEXT:    por %xmm6, %xmm0
   1523 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1524 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   1525 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
   1526 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
   1527 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
   1528 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
   1529 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
   1530 ; SSE2-NEXT:    packuswb %xmm6, %xmm1
   1531 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1532 ; SSE2-NEXT:    pand %xmm8, %xmm6
   1533 ; SSE2-NEXT:    psllw $4, %xmm6
   1534 ; SSE2-NEXT:    pand %xmm4, %xmm6
   1535 ; SSE2-NEXT:    pand %xmm4, %xmm1
   1536 ; SSE2-NEXT:    psrlw $4, %xmm1
   1537 ; SSE2-NEXT:    pand %xmm8, %xmm1
   1538 ; SSE2-NEXT:    por %xmm6, %xmm1
   1539 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1540 ; SSE2-NEXT:    pand %xmm5, %xmm6
   1541 ; SSE2-NEXT:    psllw $2, %xmm6
   1542 ; SSE2-NEXT:    pand %xmm9, %xmm6
   1543 ; SSE2-NEXT:    pand %xmm10, %xmm1
   1544 ; SSE2-NEXT:    psrlw $2, %xmm1
   1545 ; SSE2-NEXT:    pand %xmm11, %xmm1
   1546 ; SSE2-NEXT:    por %xmm6, %xmm1
   1547 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1548 ; SSE2-NEXT:    pand %xmm7, %xmm6
   1549 ; SSE2-NEXT:    psrlw $1, %xmm6
   1550 ; SSE2-NEXT:    pand %xmm12, %xmm6
   1551 ; SSE2-NEXT:    pand %xmm13, %xmm1
   1552 ; SSE2-NEXT:    paddb %xmm1, %xmm1
   1553 ; SSE2-NEXT:    por %xmm6, %xmm1
   1554 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1555 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   1556 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
   1557 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
   1558 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
   1559 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
   1560 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
   1561 ; SSE2-NEXT:    packuswb %xmm6, %xmm2
   1562 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1563 ; SSE2-NEXT:    pand %xmm8, %xmm6
   1564 ; SSE2-NEXT:    psllw $4, %xmm6
   1565 ; SSE2-NEXT:    pand %xmm4, %xmm6
   1566 ; SSE2-NEXT:    pand %xmm4, %xmm2
   1567 ; SSE2-NEXT:    psrlw $4, %xmm2
   1568 ; SSE2-NEXT:    pand %xmm8, %xmm2
   1569 ; SSE2-NEXT:    por %xmm6, %xmm2
   1570 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1571 ; SSE2-NEXT:    pand %xmm5, %xmm6
   1572 ; SSE2-NEXT:    psllw $2, %xmm6
   1573 ; SSE2-NEXT:    pand %xmm9, %xmm6
   1574 ; SSE2-NEXT:    pand %xmm10, %xmm2
   1575 ; SSE2-NEXT:    psrlw $2, %xmm2
   1576 ; SSE2-NEXT:    pand %xmm11, %xmm2
   1577 ; SSE2-NEXT:    por %xmm6, %xmm2
   1578 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1579 ; SSE2-NEXT:    pand %xmm7, %xmm6
   1580 ; SSE2-NEXT:    psrlw $1, %xmm6
   1581 ; SSE2-NEXT:    pand %xmm12, %xmm6
   1582 ; SSE2-NEXT:    pand %xmm13, %xmm2
   1583 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1584 ; SSE2-NEXT:    por %xmm6, %xmm2
   1585 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   1586 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   1587 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
   1588 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
   1589 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
   1590 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
   1591 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
   1592 ; SSE2-NEXT:    packuswb %xmm6, %xmm3
   1593 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   1594 ; SSE2-NEXT:    pand %xmm8, %xmm6
   1595 ; SSE2-NEXT:    psllw $4, %xmm6
   1596 ; SSE2-NEXT:    pand %xmm4, %xmm6
   1597 ; SSE2-NEXT:    pand %xmm4, %xmm3
   1598 ; SSE2-NEXT:    psrlw $4, %xmm3
   1599 ; SSE2-NEXT:    pand %xmm8, %xmm3
   1600 ; SSE2-NEXT:    por %xmm6, %xmm3
   1601 ; SSE2-NEXT:    pand %xmm3, %xmm5
   1602 ; SSE2-NEXT:    psllw $2, %xmm5
   1603 ; SSE2-NEXT:    pand %xmm9, %xmm5
   1604 ; SSE2-NEXT:    pand %xmm10, %xmm3
   1605 ; SSE2-NEXT:    psrlw $2, %xmm3
   1606 ; SSE2-NEXT:    pand %xmm11, %xmm3
   1607 ; SSE2-NEXT:    por %xmm5, %xmm3
   1608 ; SSE2-NEXT:    pand %xmm3, %xmm7
   1609 ; SSE2-NEXT:    psrlw $1, %xmm7
   1610 ; SSE2-NEXT:    pand %xmm12, %xmm7
   1611 ; SSE2-NEXT:    pand %xmm13, %xmm3
   1612 ; SSE2-NEXT:    paddb %xmm3, %xmm3
   1613 ; SSE2-NEXT:    por %xmm7, %xmm3
   1614 ; SSE2-NEXT:    retq
   1615 ;
   1616 ; SSSE3-LABEL: test_bitreverse_v32i16:
   1617 ; SSSE3:       # %bb.0:
   1618 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
   1619 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
   1620 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   1621 ; SSSE3-NEXT:    pshufb %xmm8, %xmm1
   1622 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1623 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1624 ; SSSE3-NEXT:    pand %xmm9, %xmm0
   1625 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1626 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   1627 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   1628 ; SSSE3-NEXT:    psrlw $4, %xmm1
   1629 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   1630 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1631 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
   1632 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
   1633 ; SSSE3-NEXT:    por %xmm6, %xmm0
   1634 ; SSSE3-NEXT:    pshufb %xmm8, %xmm5
   1635 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
   1636 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   1637 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   1638 ; SSSE3-NEXT:    pshufb %xmm1, %xmm6
   1639 ; SSSE3-NEXT:    psrlw $4, %xmm5
   1640 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   1641 ; SSSE3-NEXT:    movdqa %xmm4, %xmm1
   1642 ; SSSE3-NEXT:    pshufb %xmm5, %xmm1
   1643 ; SSSE3-NEXT:    por %xmm6, %xmm1
   1644 ; SSSE3-NEXT:    pshufb %xmm8, %xmm2
   1645 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
   1646 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   1647 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   1648 ; SSSE3-NEXT:    pshufb %xmm5, %xmm6
   1649 ; SSSE3-NEXT:    psrlw $4, %xmm2
   1650 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   1651 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
   1652 ; SSSE3-NEXT:    pshufb %xmm2, %xmm5
   1653 ; SSSE3-NEXT:    por %xmm6, %xmm5
   1654 ; SSSE3-NEXT:    pshufb %xmm8, %xmm3
   1655 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
   1656 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   1657 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   1658 ; SSSE3-NEXT:    psrlw $4, %xmm3
   1659 ; SSSE3-NEXT:    pand %xmm9, %xmm3
   1660 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
   1661 ; SSSE3-NEXT:    por %xmm7, %xmm4
   1662 ; SSSE3-NEXT:    movdqa %xmm5, %xmm2
   1663 ; SSSE3-NEXT:    movdqa %xmm4, %xmm3
   1664 ; SSSE3-NEXT:    retq
   1665 ;
   1666 ; AVX1-LABEL: test_bitreverse_v32i16:
   1667 ; AVX1:       # %bb.0:
   1668 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1669 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   1670 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   1671 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1672 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   1673 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1674 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   1675 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   1676 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   1677 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1678 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   1679 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   1680 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   1681 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
   1682 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   1683 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1684 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   1685 ; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
   1686 ; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
   1687 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1688 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1689 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   1690 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   1691 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   1692 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   1693 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   1694 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   1695 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   1696 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   1697 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
   1698 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
   1699 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   1700 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   1701 ; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
   1702 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
   1703 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1704 ; AVX1-NEXT:    retq
   1705 ;
   1706 ; AVX2-LABEL: test_bitreverse_v32i16:
   1707 ; AVX2:       # %bb.0:
   1708 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   1709 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   1710 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1711 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
   1712 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1713 ; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
   1714 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1715 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
   1716 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1717 ; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
   1718 ; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
   1719 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   1720 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
   1721 ; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
   1722 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
   1723 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
   1724 ; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
   1725 ; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
   1726 ; AVX2-NEXT:    retq
   1727 ;
   1728 ; AVX512F-LABEL: test_bitreverse_v32i16:
   1729 ; AVX512F:       # %bb.0:
   1730 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
   1731 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   1732 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1733 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm4
   1734 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1735 ; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
   1736 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
   1737 ; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
   1738 ; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1739 ; AVX512F-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
   1740 ; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
   1741 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   1742 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm2
   1743 ; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
   1744 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
   1745 ; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
   1746 ; AVX512F-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
   1747 ; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
   1748 ; AVX512F-NEXT:    retq
   1749 ;
   1750 ; AVX512BW-LABEL: test_bitreverse_v32i16:
   1751 ; AVX512BW:       # %bb.0:
   1752 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
   1753 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1754 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
   1755 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1756 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
   1757 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
   1758 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   1759 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1760 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
   1761 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
   1762 ; AVX512BW-NEXT:    retq
   1763 ;
   1764 ; XOPAVX1-LABEL: test_bitreverse_v32i16:
   1765 ; XOPAVX1:       # %bb.0:
   1766 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1767 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
   1768 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   1769 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   1770 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   1771 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1772 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   1773 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   1774 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   1775 ; XOPAVX1-NEXT:    retq
   1776 ;
   1777 ; XOPAVX2-LABEL: test_bitreverse_v32i16:
   1778 ; XOPAVX2:       # %bb.0:
   1779 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   1780 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
   1781 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   1782 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   1783 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   1784 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   1785 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   1786 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   1787 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
   1788 ; XOPAVX2-NEXT:    retq
   1789   %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
   1790   ret <32 x i16> %b
   1791 }
   1792 
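; v16i32 below: a 32-bit byte swap per element followed by the per-byte bit reversal.
; Plain AVX512F has no 512-bit byte shuffle, so it instead builds the byte swap from
; vpslld/vpsrld/vpandd with broadcast masks and then does the bit reversal with three
; more shift/mask/or stages, all on zmm registers.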
   1793 define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
   1794 ; SSE2-LABEL: test_bitreverse_v16i32:
   1795 ; SSE2:       # %bb.0:
   1796 ; SSE2-NEXT:    pxor %xmm14, %xmm14
   1797 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   1798 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
   1799 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   1800 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   1801 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
   1802 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   1803 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
   1804 ; SSE2-NEXT:    packuswb %xmm4, %xmm0
   1805 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1806 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   1807 ; SSE2-NEXT:    pand %xmm8, %xmm5
   1808 ; SSE2-NEXT:    psllw $4, %xmm5
   1809 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
   1810 ; SSE2-NEXT:    pand %xmm4, %xmm5
   1811 ; SSE2-NEXT:    pand %xmm4, %xmm0
   1812 ; SSE2-NEXT:    psrlw $4, %xmm0
   1813 ; SSE2-NEXT:    pand %xmm8, %xmm0
   1814 ; SSE2-NEXT:    por %xmm5, %xmm0
   1815 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
   1816 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   1817 ; SSE2-NEXT:    pand %xmm5, %xmm7
   1818 ; SSE2-NEXT:    psllw $2, %xmm7
   1819 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
   1820 ; SSE2-NEXT:    pand %xmm9, %xmm7
   1821 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
   1822 ; SSE2-NEXT:    pand %xmm10, %xmm0
   1823 ; SSE2-NEXT:    psrlw $2, %xmm0
   1824 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
   1825 ; SSE2-NEXT:    pand %xmm11, %xmm0
   1826 ; SSE2-NEXT:    por %xmm7, %xmm0
   1827 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
   1828 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   1829 ; SSE2-NEXT:    pand %xmm7, %xmm6
   1830 ; SSE2-NEXT:    psrlw $1, %xmm6
   1831 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   1832 ; SSE2-NEXT:    pand %xmm12, %xmm6
   1833 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
   1834 ; SSE2-NEXT:    pand %xmm13, %xmm0
   1835 ; SSE2-NEXT:    paddb %xmm0, %xmm0
   1836 ; SSE2-NEXT:    por %xmm6, %xmm0
   1837 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1838 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   1839 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
   1840 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
   1841 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
   1842 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
   1843 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
   1844 ; SSE2-NEXT:    packuswb %xmm6, %xmm1
   1845 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1846 ; SSE2-NEXT:    pand %xmm8, %xmm6
   1847 ; SSE2-NEXT:    psllw $4, %xmm6
   1848 ; SSE2-NEXT:    pand %xmm4, %xmm6
   1849 ; SSE2-NEXT:    pand %xmm4, %xmm1
   1850 ; SSE2-NEXT:    psrlw $4, %xmm1
   1851 ; SSE2-NEXT:    pand %xmm8, %xmm1
   1852 ; SSE2-NEXT:    por %xmm6, %xmm1
   1853 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1854 ; SSE2-NEXT:    pand %xmm5, %xmm6
   1855 ; SSE2-NEXT:    psllw $2, %xmm6
   1856 ; SSE2-NEXT:    pand %xmm9, %xmm6
   1857 ; SSE2-NEXT:    pand %xmm10, %xmm1
   1858 ; SSE2-NEXT:    psrlw $2, %xmm1
   1859 ; SSE2-NEXT:    pand %xmm11, %xmm1
   1860 ; SSE2-NEXT:    por %xmm6, %xmm1
   1861 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   1862 ; SSE2-NEXT:    pand %xmm7, %xmm6
   1863 ; SSE2-NEXT:    psrlw $1, %xmm6
   1864 ; SSE2-NEXT:    pand %xmm12, %xmm6
   1865 ; SSE2-NEXT:    pand %xmm13, %xmm1
   1866 ; SSE2-NEXT:    paddb %xmm1, %xmm1
   1867 ; SSE2-NEXT:    por %xmm6, %xmm1
   1868 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1869 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   1870 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
   1871 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
   1872 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
   1873 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
   1874 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
   1875 ; SSE2-NEXT:    packuswb %xmm6, %xmm2
   1876 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1877 ; SSE2-NEXT:    pand %xmm8, %xmm6
   1878 ; SSE2-NEXT:    psllw $4, %xmm6
   1879 ; SSE2-NEXT:    pand %xmm4, %xmm6
   1880 ; SSE2-NEXT:    pand %xmm4, %xmm2
   1881 ; SSE2-NEXT:    psrlw $4, %xmm2
   1882 ; SSE2-NEXT:    pand %xmm8, %xmm2
   1883 ; SSE2-NEXT:    por %xmm6, %xmm2
   1884 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1885 ; SSE2-NEXT:    pand %xmm5, %xmm6
   1886 ; SSE2-NEXT:    psllw $2, %xmm6
   1887 ; SSE2-NEXT:    pand %xmm9, %xmm6
   1888 ; SSE2-NEXT:    pand %xmm10, %xmm2
   1889 ; SSE2-NEXT:    psrlw $2, %xmm2
   1890 ; SSE2-NEXT:    pand %xmm11, %xmm2
   1891 ; SSE2-NEXT:    por %xmm6, %xmm2
   1892 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   1893 ; SSE2-NEXT:    pand %xmm7, %xmm6
   1894 ; SSE2-NEXT:    psrlw $1, %xmm6
   1895 ; SSE2-NEXT:    pand %xmm12, %xmm6
   1896 ; SSE2-NEXT:    pand %xmm13, %xmm2
   1897 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   1898 ; SSE2-NEXT:    por %xmm6, %xmm2
   1899 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   1900 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   1901 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
   1902 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
   1903 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
   1904 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
   1905 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
   1906 ; SSE2-NEXT:    packuswb %xmm6, %xmm3
   1907 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   1908 ; SSE2-NEXT:    pand %xmm8, %xmm6
   1909 ; SSE2-NEXT:    psllw $4, %xmm6
   1910 ; SSE2-NEXT:    pand %xmm4, %xmm6
   1911 ; SSE2-NEXT:    pand %xmm4, %xmm3
   1912 ; SSE2-NEXT:    psrlw $4, %xmm3
   1913 ; SSE2-NEXT:    pand %xmm8, %xmm3
   1914 ; SSE2-NEXT:    por %xmm6, %xmm3
   1915 ; SSE2-NEXT:    pand %xmm3, %xmm5
   1916 ; SSE2-NEXT:    psllw $2, %xmm5
   1917 ; SSE2-NEXT:    pand %xmm9, %xmm5
   1918 ; SSE2-NEXT:    pand %xmm10, %xmm3
   1919 ; SSE2-NEXT:    psrlw $2, %xmm3
   1920 ; SSE2-NEXT:    pand %xmm11, %xmm3
   1921 ; SSE2-NEXT:    por %xmm5, %xmm3
   1922 ; SSE2-NEXT:    pand %xmm3, %xmm7
   1923 ; SSE2-NEXT:    psrlw $1, %xmm7
   1924 ; SSE2-NEXT:    pand %xmm12, %xmm7
   1925 ; SSE2-NEXT:    pand %xmm13, %xmm3
   1926 ; SSE2-NEXT:    paddb %xmm3, %xmm3
   1927 ; SSE2-NEXT:    por %xmm7, %xmm3
   1928 ; SSE2-NEXT:    retq
   1929 ;
   1930 ; SSSE3-LABEL: test_bitreverse_v16i32:
   1931 ; SSSE3:       # %bb.0:
   1932 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
   1933 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
   1934 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   1935 ; SSSE3-NEXT:    pshufb %xmm8, %xmm1
   1936 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1937 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   1938 ; SSSE3-NEXT:    pand %xmm9, %xmm0
   1939 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1940 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   1941 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   1942 ; SSSE3-NEXT:    psrlw $4, %xmm1
   1943 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   1944 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1945 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
   1946 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
   1947 ; SSSE3-NEXT:    por %xmm6, %xmm0
   1948 ; SSSE3-NEXT:    pshufb %xmm8, %xmm5
   1949 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
   1950 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   1951 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   1952 ; SSSE3-NEXT:    pshufb %xmm1, %xmm6
   1953 ; SSSE3-NEXT:    psrlw $4, %xmm5
   1954 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   1955 ; SSSE3-NEXT:    movdqa %xmm4, %xmm1
   1956 ; SSSE3-NEXT:    pshufb %xmm5, %xmm1
   1957 ; SSSE3-NEXT:    por %xmm6, %xmm1
   1958 ; SSSE3-NEXT:    pshufb %xmm8, %xmm2
   1959 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
   1960 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   1961 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   1962 ; SSSE3-NEXT:    pshufb %xmm5, %xmm6
   1963 ; SSSE3-NEXT:    psrlw $4, %xmm2
   1964 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   1965 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
   1966 ; SSSE3-NEXT:    pshufb %xmm2, %xmm5
   1967 ; SSSE3-NEXT:    por %xmm6, %xmm5
   1968 ; SSSE3-NEXT:    pshufb %xmm8, %xmm3
   1969 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
   1970 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   1971 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   1972 ; SSSE3-NEXT:    psrlw $4, %xmm3
   1973 ; SSSE3-NEXT:    pand %xmm9, %xmm3
   1974 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
   1975 ; SSSE3-NEXT:    por %xmm7, %xmm4
   1976 ; SSSE3-NEXT:    movdqa %xmm5, %xmm2
   1977 ; SSSE3-NEXT:    movdqa %xmm4, %xmm3
   1978 ; SSSE3-NEXT:    retq
   1979 ;
   1980 ; AVX1-LABEL: test_bitreverse_v16i32:
   1981 ; AVX1:       # %bb.0:
   1982 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1983 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   1984 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   1985 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   1986 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   1987 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   1988 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   1989 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   1990 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   1991 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   1992 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   1993 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   1994 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   1995 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
   1996 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   1997 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   1998 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   1999 ; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
   2000 ; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
   2001 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2002 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2003 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   2004 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   2005 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   2006 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   2007 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2008 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   2009 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   2010 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   2011 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
   2012 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
   2013 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   2014 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   2015 ; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
   2016 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
   2017 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   2018 ; AVX1-NEXT:    retq
   2019 ;
   2020 ; AVX2-LABEL: test_bitreverse_v16i32:
   2021 ; AVX2:       # %bb.0:
   2022 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
   2023 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   2024 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2025 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
   2026 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2027 ; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
   2028 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   2029 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
   2030 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2031 ; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
   2032 ; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
   2033 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   2034 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
   2035 ; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
   2036 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
   2037 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
   2038 ; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
   2039 ; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
   2040 ; AVX2-NEXT:    retq
   2041 ;
   2042 ; AVX512F-LABEL: test_bitreverse_v16i32:
   2043 ; AVX512F:       # %bb.0:
   2044 ; AVX512F-NEXT:    vpsrld $24, %zmm0, %zmm1
   2045 ; AVX512F-NEXT:    vpsrld $8, %zmm0, %zmm2
   2046 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
   2047 ; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
   2048 ; AVX512F-NEXT:    vpslld $24, %zmm0, %zmm2
   2049 ; AVX512F-NEXT:    vpslld $8, %zmm0, %zmm0
   2050 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
   2051 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
   2052 ; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
   2053 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
   2054 ; AVX512F-NEXT:    vpslld $4, %zmm1, %zmm1
   2055 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
   2056 ; AVX512F-NEXT:    vpsrld $4, %zmm0, %zmm0
   2057 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
   2058 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
   2059 ; AVX512F-NEXT:    vpslld $2, %zmm1, %zmm1
   2060 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
   2061 ; AVX512F-NEXT:    vpsrld $2, %zmm0, %zmm0
   2062 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
   2063 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
   2064 ; AVX512F-NEXT:    vpslld $1, %zmm1, %zmm1
   2065 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
   2066 ; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
   2067 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
   2068 ; AVX512F-NEXT:    retq
   2069 ;
   2070 ; AVX512BW-LABEL: test_bitreverse_v16i32:
   2071 ; AVX512BW:       # %bb.0:
   2072 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
   2073 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2074 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
   2075 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2076 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
   2077 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
   2078 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   2079 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2080 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
   2081 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
   2082 ; AVX512BW-NEXT:    retq
   2083 ;
   2084 ; XOPAVX1-LABEL: test_bitreverse_v16i32:
   2085 ; XOPAVX1:       # %bb.0:
   2086 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2087 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
   2088 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2089 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   2090 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2091 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2092 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2093 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   2094 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   2095 ; XOPAVX1-NEXT:    retq
   2096 ;
   2097 ; XOPAVX2-LABEL: test_bitreverse_v16i32:
   2098 ; XOPAVX2:       # %bb.0:
   2099 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2100 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
   2101 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2102 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   2103 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   2104 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2105 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2106 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   2107 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
   2108 ; XOPAVX2-NEXT:    retq
   2109   %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
   2110   ret <16 x i32> %b
   2111 }
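
; The v16i32 lowerings above split bit reversal into two steps: reverse the
; byte order within each 32-bit element (a byte shuffle, or dword shifts and
; masks on plain AVX512F), then reverse the bits within each byte, either with
; two 16-entry nibble lookup tables via (v)pshufb (SSSE3/AVX2/AVX512BW) or with
; further shift-and-mask swaps (AVX512F, which has no 512-bit byte shuffle
; without AVX512BW). XOP's vpperm folds both steps into one instruction:
; selector bytes in the 0x50-0x5F range select a source byte and return it
; bit-reversed.
;
; A minimal scalar IR sketch of the nibble-table step, using the same 4-bit
; reverse table that appears in the vpshufb constants above. @rev4_sketch and
; @bitreverse8_lut_sketch are illustrative helpers added for exposition only;
; no RUN or CHECK line exercises them.
@rev4_sketch = private unnamed_addr constant [16 x i8] [i8 0, i8 8, i8 4, i8 12, i8 2, i8 10, i8 6, i8 14, i8 1, i8 9, i8 5, i8 13, i8 3, i8 11, i8 7, i8 15]

define i8 @bitreverse8_lut_sketch(i8 %x) nounwind {
  %lo = and i8 %x, 15                       ; low nibble
  %hi = lshr i8 %x, 4                       ; high nibble
  %lo.idx = zext i8 %lo to i64
  %hi.idx = zext i8 %hi to i64
  %lo.ptr = getelementptr inbounds [16 x i8], [16 x i8]* @rev4_sketch, i64 0, i64 %lo.idx
  %hi.ptr = getelementptr inbounds [16 x i8], [16 x i8]* @rev4_sketch, i64 0, i64 %hi.idx
  %lo.rev = load i8, i8* %lo.ptr            ; bit-reversed low nibble
  %hi.rev = load i8, i8* %hi.ptr            ; bit-reversed high nibble
  %lo.rev.hi = shl i8 %lo.rev, 4            ; reversed low nibble becomes the high nibble
  %r = or i8 %lo.rev.hi, %hi.rev
  ret i8 %r
}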
   2112 
   2113 define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
   2114 ; SSE2-LABEL: test_bitreverse_v8i64:
   2115 ; SSE2:       # %bb.0:
   2116 ; SSE2-NEXT:    pxor %xmm14, %xmm14
   2117 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
   2118 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
   2119 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
   2120 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
   2121 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
   2122 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
   2123 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
   2124 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
   2125 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
   2126 ; SSE2-NEXT:    packuswb %xmm4, %xmm0
   2127 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2128 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2129 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2130 ; SSE2-NEXT:    psllw $4, %xmm5
   2131 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
   2132 ; SSE2-NEXT:    pand %xmm4, %xmm5
   2133 ; SSE2-NEXT:    pand %xmm4, %xmm0
   2134 ; SSE2-NEXT:    psrlw $4, %xmm0
   2135 ; SSE2-NEXT:    pand %xmm8, %xmm0
   2136 ; SSE2-NEXT:    por %xmm5, %xmm0
   2137 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
   2138 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   2139 ; SSE2-NEXT:    pand %xmm5, %xmm7
   2140 ; SSE2-NEXT:    psllw $2, %xmm7
   2141 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
   2142 ; SSE2-NEXT:    pand %xmm9, %xmm7
   2143 ; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
   2144 ; SSE2-NEXT:    pand %xmm10, %xmm0
   2145 ; SSE2-NEXT:    psrlw $2, %xmm0
   2146 ; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
   2147 ; SSE2-NEXT:    pand %xmm11, %xmm0
   2148 ; SSE2-NEXT:    por %xmm7, %xmm0
   2149 ; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
   2150 ; SSE2-NEXT:    movdqa %xmm0, %xmm6
   2151 ; SSE2-NEXT:    pand %xmm7, %xmm6
   2152 ; SSE2-NEXT:    psrlw $1, %xmm6
   2153 ; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
   2154 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2155 ; SSE2-NEXT:    movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
   2156 ; SSE2-NEXT:    pand %xmm13, %xmm0
   2157 ; SSE2-NEXT:    paddb %xmm0, %xmm0
   2158 ; SSE2-NEXT:    por %xmm6, %xmm0
   2159 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2160 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   2161 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
   2162 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
   2163 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
   2164 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
   2165 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2166 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
   2167 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
   2168 ; SSE2-NEXT:    packuswb %xmm6, %xmm1
   2169 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2170 ; SSE2-NEXT:    pand %xmm8, %xmm6
   2171 ; SSE2-NEXT:    psllw $4, %xmm6
   2172 ; SSE2-NEXT:    pand %xmm4, %xmm6
   2173 ; SSE2-NEXT:    pand %xmm4, %xmm1
   2174 ; SSE2-NEXT:    psrlw $4, %xmm1
   2175 ; SSE2-NEXT:    pand %xmm8, %xmm1
   2176 ; SSE2-NEXT:    por %xmm6, %xmm1
   2177 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2178 ; SSE2-NEXT:    pand %xmm5, %xmm6
   2179 ; SSE2-NEXT:    psllw $2, %xmm6
   2180 ; SSE2-NEXT:    pand %xmm9, %xmm6
   2181 ; SSE2-NEXT:    pand %xmm10, %xmm1
   2182 ; SSE2-NEXT:    psrlw $2, %xmm1
   2183 ; SSE2-NEXT:    pand %xmm11, %xmm1
   2184 ; SSE2-NEXT:    por %xmm6, %xmm1
   2185 ; SSE2-NEXT:    movdqa %xmm1, %xmm6
   2186 ; SSE2-NEXT:    pand %xmm7, %xmm6
   2187 ; SSE2-NEXT:    psrlw $1, %xmm6
   2188 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2189 ; SSE2-NEXT:    pand %xmm13, %xmm1
   2190 ; SSE2-NEXT:    paddb %xmm1, %xmm1
   2191 ; SSE2-NEXT:    por %xmm6, %xmm1
   2192 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2193 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   2194 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
   2195 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
   2196 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
   2197 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
   2198 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   2199 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
   2200 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
   2201 ; SSE2-NEXT:    packuswb %xmm6, %xmm2
   2202 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2203 ; SSE2-NEXT:    pand %xmm8, %xmm6
   2204 ; SSE2-NEXT:    psllw $4, %xmm6
   2205 ; SSE2-NEXT:    pand %xmm4, %xmm6
   2206 ; SSE2-NEXT:    pand %xmm4, %xmm2
   2207 ; SSE2-NEXT:    psrlw $4, %xmm2
   2208 ; SSE2-NEXT:    pand %xmm8, %xmm2
   2209 ; SSE2-NEXT:    por %xmm6, %xmm2
   2210 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2211 ; SSE2-NEXT:    pand %xmm5, %xmm6
   2212 ; SSE2-NEXT:    psllw $2, %xmm6
   2213 ; SSE2-NEXT:    pand %xmm9, %xmm6
   2214 ; SSE2-NEXT:    pand %xmm10, %xmm2
   2215 ; SSE2-NEXT:    psrlw $2, %xmm2
   2216 ; SSE2-NEXT:    pand %xmm11, %xmm2
   2217 ; SSE2-NEXT:    por %xmm6, %xmm2
   2218 ; SSE2-NEXT:    movdqa %xmm2, %xmm6
   2219 ; SSE2-NEXT:    pand %xmm7, %xmm6
   2220 ; SSE2-NEXT:    psrlw $1, %xmm6
   2221 ; SSE2-NEXT:    pand %xmm12, %xmm6
   2222 ; SSE2-NEXT:    pand %xmm13, %xmm2
   2223 ; SSE2-NEXT:    paddb %xmm2, %xmm2
   2224 ; SSE2-NEXT:    por %xmm6, %xmm2
   2225 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   2226 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15]
   2227 ; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
   2228 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
   2229 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
   2230 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
   2231 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   2232 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
   2233 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
   2234 ; SSE2-NEXT:    packuswb %xmm6, %xmm3
   2235 ; SSE2-NEXT:    movdqa %xmm3, %xmm6
   2236 ; SSE2-NEXT:    pand %xmm8, %xmm6
   2237 ; SSE2-NEXT:    psllw $4, %xmm6
   2238 ; SSE2-NEXT:    pand %xmm4, %xmm6
   2239 ; SSE2-NEXT:    pand %xmm4, %xmm3
   2240 ; SSE2-NEXT:    psrlw $4, %xmm3
   2241 ; SSE2-NEXT:    pand %xmm8, %xmm3
   2242 ; SSE2-NEXT:    por %xmm6, %xmm3
   2243 ; SSE2-NEXT:    pand %xmm3, %xmm5
   2244 ; SSE2-NEXT:    psllw $2, %xmm5
   2245 ; SSE2-NEXT:    pand %xmm9, %xmm5
   2246 ; SSE2-NEXT:    pand %xmm10, %xmm3
   2247 ; SSE2-NEXT:    psrlw $2, %xmm3
   2248 ; SSE2-NEXT:    pand %xmm11, %xmm3
   2249 ; SSE2-NEXT:    por %xmm5, %xmm3
   2250 ; SSE2-NEXT:    pand %xmm3, %xmm7
   2251 ; SSE2-NEXT:    psrlw $1, %xmm7
   2252 ; SSE2-NEXT:    pand %xmm12, %xmm7
   2253 ; SSE2-NEXT:    pand %xmm13, %xmm3
   2254 ; SSE2-NEXT:    paddb %xmm3, %xmm3
   2255 ; SSE2-NEXT:    por %xmm7, %xmm3
   2256 ; SSE2-NEXT:    retq
   2257 ;
   2258 ; SSSE3-LABEL: test_bitreverse_v8i64:
   2259 ; SSSE3:       # %bb.0:
   2260 ; SSSE3-NEXT:    movdqa %xmm1, %xmm5
   2261 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
   2262 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   2263 ; SSSE3-NEXT:    pshufb %xmm8, %xmm1
   2264 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2265 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
   2266 ; SSSE3-NEXT:    pand %xmm9, %xmm0
   2267 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2268 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   2269 ; SSSE3-NEXT:    pshufb %xmm0, %xmm6
   2270 ; SSSE3-NEXT:    psrlw $4, %xmm1
   2271 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   2272 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2273 ; SSSE3-NEXT:    movdqa %xmm4, %xmm0
   2274 ; SSSE3-NEXT:    pshufb %xmm1, %xmm0
   2275 ; SSSE3-NEXT:    por %xmm6, %xmm0
   2276 ; SSSE3-NEXT:    pshufb %xmm8, %xmm5
   2277 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
   2278 ; SSSE3-NEXT:    pand %xmm9, %xmm1
   2279 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   2280 ; SSSE3-NEXT:    pshufb %xmm1, %xmm6
   2281 ; SSSE3-NEXT:    psrlw $4, %xmm5
   2282 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   2283 ; SSSE3-NEXT:    movdqa %xmm4, %xmm1
   2284 ; SSSE3-NEXT:    pshufb %xmm5, %xmm1
   2285 ; SSSE3-NEXT:    por %xmm6, %xmm1
   2286 ; SSSE3-NEXT:    pshufb %xmm8, %xmm2
   2287 ; SSSE3-NEXT:    movdqa %xmm2, %xmm5
   2288 ; SSSE3-NEXT:    pand %xmm9, %xmm5
   2289 ; SSSE3-NEXT:    movdqa %xmm7, %xmm6
   2290 ; SSSE3-NEXT:    pshufb %xmm5, %xmm6
   2291 ; SSSE3-NEXT:    psrlw $4, %xmm2
   2292 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   2293 ; SSSE3-NEXT:    movdqa %xmm4, %xmm5
   2294 ; SSSE3-NEXT:    pshufb %xmm2, %xmm5
   2295 ; SSSE3-NEXT:    por %xmm6, %xmm5
   2296 ; SSSE3-NEXT:    pshufb %xmm8, %xmm3
   2297 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
   2298 ; SSSE3-NEXT:    pand %xmm9, %xmm2
   2299 ; SSSE3-NEXT:    pshufb %xmm2, %xmm7
   2300 ; SSSE3-NEXT:    psrlw $4, %xmm3
   2301 ; SSSE3-NEXT:    pand %xmm9, %xmm3
   2302 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
   2303 ; SSSE3-NEXT:    por %xmm7, %xmm4
   2304 ; SSSE3-NEXT:    movdqa %xmm5, %xmm2
   2305 ; SSSE3-NEXT:    movdqa %xmm4, %xmm3
   2306 ; SSSE3-NEXT:    retq
   2307 ;
   2308 ; AVX1-LABEL: test_bitreverse_v8i64:
   2309 ; AVX1:       # %bb.0:
   2310 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2311 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   2312 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   2313 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2314 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   2315 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2316 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   2317 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   2318 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2319 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2320 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   2321 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   2322 ; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
   2323 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
   2324 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   2325 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
   2326 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2327 ; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
   2328 ; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
   2329 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2330 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2331 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
   2332 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
   2333 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
   2334 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
   2335 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2336 ; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
   2337 ; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
   2338 ; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
   2339 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
   2340 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
   2341 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
   2342 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   2343 ; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
   2344 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
   2345 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   2346 ; AVX1-NEXT:    retq
   2347 ;
   2348 ; AVX2-LABEL: test_bitreverse_v8i64:
   2349 ; AVX2:       # %bb.0:
   2350 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
   2351 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
   2352 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2353 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
   2354 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2355 ; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
   2356 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
   2357 ; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
   2358 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2359 ; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
   2360 ; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
   2361 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
   2362 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
   2363 ; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
   2364 ; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
   2365 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
   2366 ; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
   2367 ; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
   2368 ; AVX2-NEXT:    retq
   2369 ;
   2370 ; AVX512F-LABEL: test_bitreverse_v8i64:
   2371 ; AVX512F:       # %bb.0:
   2372 ; AVX512F-NEXT:    vpsrlq $56, %zmm0, %zmm1
   2373 ; AVX512F-NEXT:    vpsrlq $40, %zmm0, %zmm2
   2374 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
   2375 ; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
   2376 ; AVX512F-NEXT:    vpsrlq $24, %zmm0, %zmm2
   2377 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
   2378 ; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
   2379 ; AVX512F-NEXT:    vpsrlq $8, %zmm0, %zmm2
   2380 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
   2381 ; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
   2382 ; AVX512F-NEXT:    vpsllq $8, %zmm0, %zmm2
   2383 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
   2384 ; AVX512F-NEXT:    vpsllq $24, %zmm0, %zmm3
   2385 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3
   2386 ; AVX512F-NEXT:    vporq %zmm2, %zmm3, %zmm2
   2387 ; AVX512F-NEXT:    vpsllq $56, %zmm0, %zmm3
   2388 ; AVX512F-NEXT:    vpsllq $40, %zmm0, %zmm0
   2389 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   2390 ; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm0
   2391 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
   2392 ; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
   2393 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
   2394 ; AVX512F-NEXT:    vpsllq $4, %zmm1, %zmm1
   2395 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   2396 ; AVX512F-NEXT:    vpsrlq $4, %zmm0, %zmm0
   2397 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
   2398 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
   2399 ; AVX512F-NEXT:    vpsllq $2, %zmm1, %zmm1
   2400 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   2401 ; AVX512F-NEXT:    vpsrlq $2, %zmm0, %zmm0
   2402 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
   2403 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
   2404 ; AVX512F-NEXT:    vpsllq $1, %zmm1, %zmm1
   2405 ; AVX512F-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
   2406 ; AVX512F-NEXT:    vpsrlq $1, %zmm0, %zmm0
   2407 ; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
   2408 ; AVX512F-NEXT:    retq
   2409 ;
   2410 ; AVX512BW-LABEL: test_bitreverse_v8i64:
   2411 ; AVX512BW:       # %bb.0:
   2412 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
   2413 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
   2414 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
   2415 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
   2416 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
   2417 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
   2418 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
   2419 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
   2420 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
   2421 ; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
   2422 ; AVX512BW-NEXT:    retq
   2423 ;
   2424 ; XOPAVX1-LABEL: test_bitreverse_v8i64:
   2425 ; XOPAVX1:       # %bb.0:
   2426 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   2427 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
   2428 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2429 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   2430 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
   2431 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
   2432 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2433 ; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   2434 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
   2435 ; XOPAVX1-NEXT:    retq
   2436 ;
   2437 ; XOPAVX2-LABEL: test_bitreverse_v8i64:
   2438 ; XOPAVX2:       # %bb.0:
   2439 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2440 ; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
   2441 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2442 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
   2443 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
   2444 ; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2445 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
   2446 ; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
   2447 ; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
   2448 ; XOPAVX2-NEXT:    retq
   2449   %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
   2450   ret <8 x i64> %b
   2451 }
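
; The v8i64 lowerings are identical to the v16i32 ones except for the initial
; shuffle, which now reverses byte order within each 8-byte element; the
; per-byte bit reversal that follows is unchanged. The AVX512F-only path has
; no 512-bit byte shuffle, so it rebuilds the byte swap from element-wide
; shifts, masks and ors, then swaps nibbles, 2-bit pairs and single bits in
; place.
;
; A minimal scalar IR sketch of those three in-place swap stages, using the
; same 15/240, 51/204 and 85/170 byte masks that appear in the SSE2 checks
; above. @bitreverse8_swar_sketch is an illustrative helper added for
; exposition only; no RUN or CHECK line exercises it.
define i8 @bitreverse8_swar_sketch(i8 %x) nounwind {
  %n.lo = and i8 %x, 15                     ; 0x0F
  %n.hi = and i8 %x, -16                    ; 0xF0
  %n.lo.sh = shl i8 %n.lo, 4
  %n.hi.sh = lshr i8 %n.hi, 4
  %nib = or i8 %n.lo.sh, %n.hi.sh           ; nibbles swapped
  %p.lo = and i8 %nib, 51                   ; 0x33
  %p.hi = and i8 %nib, -52                  ; 0xCC
  %p.lo.sh = shl i8 %p.lo, 2
  %p.hi.sh = lshr i8 %p.hi, 2
  %pair = or i8 %p.lo.sh, %p.hi.sh          ; 2-bit pairs swapped
  %b.lo = and i8 %pair, 85                  ; 0x55
  %b.hi = and i8 %pair, -86                 ; 0xAA
  %b.lo.sh = shl i8 %b.lo, 1
  %b.hi.sh = lshr i8 %b.hi, 1
  %r = or i8 %b.lo.sh, %b.hi.sh             ; adjacent bits swapped
  ret i8 %r
}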
   2452 
   2453 ;
   2454 ; Constant Folding
   2455 ;
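; The tests below only check compile-time folding of bitreverse calls with
; constant operands: no bit-manipulation instructions should survive, just a
; move of an immediate (scalar) or a load from the constant pool (vector).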
   2456 
   2457 define i32 @fold_bitreverse_i32() nounwind {
   2458 ; ALL-LABEL: fold_bitreverse_i32:
   2459 ; ALL:       # %bb.0:
   2460 ; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
   2461 ; ALL-NEXT:    retq
   2462   %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
   2463   ret i32 %b
   2464 }
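
; Worked check of the folded constant above: 4278255360 is 0xFF00FF00
; (11111111 00000000 11111111 00000000); reversing all 32 bits gives
; 00000000 11111111 00000000 11111111 = 0x00FF00FF = 16711935, the immediate
; in the ALL check.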
   2465 
   2466 define <16 x i8> @fold_bitreverse_v16i8() nounwind {
   2467 ; SSE-LABEL: fold_bitreverse_v16i8:
   2468 ; SSE:       # %bb.0:
   2469 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
   2470 ; SSE-NEXT:    retq
   2471 ;
   2472 ; AVX-LABEL: fold_bitreverse_v16i8:
   2473 ; AVX:       # %bb.0:
   2474 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
   2475 ; AVX-NEXT:    retq
   2476 ;
   2477 ; XOP-LABEL: fold_bitreverse_v16i8:
   2478 ; XOP:       # %bb.0:
   2479 ; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
   2480 ; XOP-NEXT:    retq
   2481   %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
   2482   ret <16 x i8> %b
   2483 }
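
; Spot-checking the folded v16i8 lanes: bitreverse(2) = bitreverse(0b00000010)
; = 0b01000000 = 64 and bitreverse(-3) = bitreverse(0b11111101) = 0b10111111
; = 191, matching elements 2 and 3 of the vector constant above.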
   2484 
   2485 define <16 x i16> @fold_bitreverse_v16i16() nounwind {
   2486 ; SSE-LABEL: fold_bitreverse_v16i16:
   2487 ; SSE:       # %bb.0:
   2488 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
   2489 ; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
   2490 ; SSE-NEXT:    retq
   2491 ;
   2492 ; AVX-LABEL: fold_bitreverse_v16i16:
   2493 ; AVX:       # %bb.0:
   2494 ; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
   2495 ; AVX-NEXT:    retq
   2496 ;
   2497 ; XOP-LABEL: fold_bitreverse_v16i16:
   2498 ; XOP:       # %bb.0:
   2499 ; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
   2500 ; XOP-NEXT:    retq
   2501   %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
   2502   ret <16 x i16> %b
   2503 }
   2504 
   2505 define <16 x i32> @fold_bitreverse_v16i32() nounwind {
   2506 ; SSE-LABEL: fold_bitreverse_v16i32:
   2507 ; SSE:       # %bb.0:
   2508 ; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
   2509 ; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
   2510 ; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
   2511 ; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
   2512 ; SSE-NEXT:    retq
   2513 ;
   2514 ; AVX1-LABEL: fold_bitreverse_v16i32:
   2515 ; AVX1:       # %bb.0:
   2516 ; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
   2517 ; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
   2518 ; AVX1-NEXT:    retq
   2519 ;
   2520 ; AVX2-LABEL: fold_bitreverse_v16i32:
   2521 ; AVX2:       # %bb.0:
   2522 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
   2523 ; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
   2524 ; AVX2-NEXT:    retq
   2525 ;
   2526 ; AVX512-LABEL: fold_bitreverse_v16i32:
   2527 ; AVX512:       # %bb.0:
   2528 ; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
   2529 ; AVX512-NEXT:    retq
   2530 ;
   2531 ; XOP-LABEL: fold_bitreverse_v16i32:
   2532 ; XOP:       # %bb.0:
   2533 ; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
   2534 ; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
   2535 ; XOP-NEXT:    retq
   2536   %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
   2537   ret <16 x i32> %b
   2538 }
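
; None of the vector folding tests emit bit-manipulation code: the result is
; materialized straight from the constant pool, split across as many registers
; as the target provides (for v16i32: four xmm under SSE, two ymm under
; AVX/XOP, a single zmm under AVX512).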
   2539 
   2540 declare i8 @llvm.bitreverse.i8(i8) readnone
   2541 declare i16 @llvm.bitreverse.i16(i16) readnone
   2542 declare i32 @llvm.bitreverse.i32(i32) readnone
   2543 declare i64 @llvm.bitreverse.i64(i64) readnone
   2544 
   2545 declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
   2546 declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
   2547 declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
   2548 declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
   2549 
   2550 declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
   2551 declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
   2552 declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
   2553 declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone
   2554 
   2555 declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
   2556 declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
    2557 declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
    2558 declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
   2559