; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE: # %bb.0:
; SSE-NEXT: rolb $4, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $51, %al
; SSE-NEXT: shlb $2, %al
; SSE-NEXT: andb $-52, %dil
; SSE-NEXT: shrb $2, %dil
; SSE-NEXT: orb %al, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $85, %al
; SSE-NEXT: addb %al, %al
; SSE-NEXT: andb $-86, %dil
; SSE-NEXT: shrb %dil
; SSE-NEXT: orb %al, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX: # %bb.0:
; AVX-NEXT: rolb $4, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $51, %al
; AVX-NEXT: shlb $2, %al
; AVX-NEXT: andb $-52, %dil
; AVX-NEXT: shrb $2, %dil
; AVX-NEXT: orb %al, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $85, %al
; AVX-NEXT: addb %al, %al
; AVX-NEXT: andb $-86, %dil
; AVX-NEXT: shrb %dil
; AVX-NEXT: orb %al, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vpextrb $0, %xmm0, %eax
; XOP-NEXT: # kill: def $al killed $al killed $eax
; XOP-NEXT: retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: rolw $8, %di
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $13107, %eax # imm = 0x3333
; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: # kill: def $ax killed $ax killed $eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: rolw $8, %di
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $13107, %eax # imm = 0x3333
; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: def $ax killed $ax killed $eax
; XOP-NEXT: retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE: # %bb.0:
; SSE-NEXT: # kill: def $edi killed $edi def $rdi
; SSE-NEXT: bswapl %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT: shll $4, %eax
; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; SSE-NEXT: shrl $4, %edi
; SSE-NEXT: orl %eax, %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; SSE-NEXT: shrl $2, %edi
; SSE-NEXT: leal (%rdi,%rax,4), %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $edi killed $edi def $rdi
; AVX-NEXT: bswapl %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT: shll $4, %eax
; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
; AVX-NEXT: shrl $4, %edi
; AVX-NEXT: orl %eax, %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
; AVX-NEXT: shrl $2, %edi
; AVX-NEXT: leal (%rdi,%rax,4), %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE: # %bb.0:
; SSE-NEXT: bswapq %rdi
; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT: andq %rdi, %rax
; SSE-NEXT: shlq $4, %rax
; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; SSE-NEXT: andq %rdi, %rcx
; SSE-NEXT: shrq $4, %rcx
; SSE-NEXT: orq %rax, %rcx
; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT: andq %rcx, %rax
; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; SSE-NEXT: andq %rcx, %rdx
; SSE-NEXT: shrq $2, %rdx
; SSE-NEXT: leaq (%rdx,%rax,4), %rax
; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT: andq %rax, %rcx
; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; SSE-NEXT: andq %rax, %rdx
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (%rdx,%rcx,2), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX: # %bb.0:
; AVX-NEXT: bswapq %rdi
; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT: andq %rdi, %rax
; AVX-NEXT: shlq $4, %rax
; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
; AVX-NEXT: andq %rdi, %rcx
; AVX-NEXT: shrq $4, %rcx
; AVX-NEXT: orq %rax, %rcx
; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT: andq %rcx, %rax
; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
; AVX-NEXT: andq %rcx, %rdx
; AVX-NEXT: shrq $2, %rdx
; AVX-NEXT: leaq (%rdx,%rax,4), %rax
; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT: andq %rax, %rcx
; AVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
; AVX-NEXT: andq %rax, %rdx
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (%rdx,%rcx,2), %rax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovq %xmm0, %rax
; XOP-NEXT: retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: psllw $2, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: psllw $4, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm4, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm11, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: pand %xmm2, %xmm7
; SSE2-NEXT: psllw $4, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm7, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm6, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm5
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm11, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm12, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm7, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: psrlw $1, %xmm5
; SSE2-NEXT: pand %xmm11, %xmm5
; SSE2-NEXT: pand %xmm12, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}

define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm11, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm12, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm7, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: psrlw $1, %xmm5
; SSE2-NEXT: pand %xmm11, %xmm5
; SSE2-NEXT: pand %xmm12, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v8i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
  ret <8 x i32> %b
}

define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psllw $4, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pand %xmm6, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm3, %xmm5
; SSE2-NEXT: psllw $2, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: pand %xmm5, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm11, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm12, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm7, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm2, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psllw $2, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm5
; SSE2-NEXT: psrlw $1, %xmm5
; SSE2-NEXT: pand %xmm11, %xmm5
; SSE2-NEXT: pand %xmm12, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i64:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm2, %xmm7
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm7, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm6, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
  ret <4 x i64> %b
}

define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm13, %xmm5
; SSE2-NEXT: psllw $4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; SSE2-NEXT: pand %xmm7, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: psrlw $4, %xmm0
; SSE2-NEXT: pand %xmm13, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: psllw $2, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: psrlw $2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT: pand %xmm12, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm13, %xmm4
; SSE2-NEXT: psllw $4, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: pand %xmm13, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: psllw $2, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: pand %xmm12, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm1
; SSE2-NEXT: por %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
%xmm4 1275 ; SSE2-NEXT: pand %xmm13, %xmm4 1276 ; SSE2-NEXT: psllw $4, %xmm4 1277 ; SSE2-NEXT: pand %xmm7, %xmm4 1278 ; SSE2-NEXT: pand %xmm7, %xmm2 1279 ; SSE2-NEXT: psrlw $4, %xmm2 1280 ; SSE2-NEXT: pand %xmm13, %xmm2 1281 ; SSE2-NEXT: por %xmm4, %xmm2 1282 ; SSE2-NEXT: movdqa %xmm2, %xmm4 1283 ; SSE2-NEXT: pand %xmm5, %xmm4 1284 ; SSE2-NEXT: psllw $2, %xmm4 1285 ; SSE2-NEXT: pand %xmm8, %xmm4 1286 ; SSE2-NEXT: pand %xmm9, %xmm2 1287 ; SSE2-NEXT: psrlw $2, %xmm2 1288 ; SSE2-NEXT: pand %xmm10, %xmm2 1289 ; SSE2-NEXT: por %xmm4, %xmm2 1290 ; SSE2-NEXT: movdqa %xmm2, %xmm4 1291 ; SSE2-NEXT: pand %xmm6, %xmm4 1292 ; SSE2-NEXT: psrlw $1, %xmm4 1293 ; SSE2-NEXT: pand %xmm11, %xmm4 1294 ; SSE2-NEXT: pand %xmm12, %xmm2 1295 ; SSE2-NEXT: paddb %xmm2, %xmm2 1296 ; SSE2-NEXT: por %xmm4, %xmm2 1297 ; SSE2-NEXT: movdqa %xmm3, %xmm4 1298 ; SSE2-NEXT: pand %xmm13, %xmm4 1299 ; SSE2-NEXT: psllw $4, %xmm4 1300 ; SSE2-NEXT: pand %xmm7, %xmm4 1301 ; SSE2-NEXT: pand %xmm7, %xmm3 1302 ; SSE2-NEXT: psrlw $4, %xmm3 1303 ; SSE2-NEXT: pand %xmm13, %xmm3 1304 ; SSE2-NEXT: por %xmm4, %xmm3 1305 ; SSE2-NEXT: pand %xmm3, %xmm5 1306 ; SSE2-NEXT: psllw $2, %xmm5 1307 ; SSE2-NEXT: pand %xmm8, %xmm5 1308 ; SSE2-NEXT: pand %xmm9, %xmm3 1309 ; SSE2-NEXT: psrlw $2, %xmm3 1310 ; SSE2-NEXT: pand %xmm10, %xmm3 1311 ; SSE2-NEXT: por %xmm5, %xmm3 1312 ; SSE2-NEXT: pand %xmm3, %xmm6 1313 ; SSE2-NEXT: psrlw $1, %xmm6 1314 ; SSE2-NEXT: pand %xmm11, %xmm6 1315 ; SSE2-NEXT: pand %xmm12, %xmm3 1316 ; SSE2-NEXT: paddb %xmm3, %xmm3 1317 ; SSE2-NEXT: por %xmm6, %xmm3 1318 ; SSE2-NEXT: retq 1319 ; 1320 ; SSSE3-LABEL: test_bitreverse_v64i8: 1321 ; SSSE3: # %bb.0: 1322 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 1323 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1324 ; SSSE3-NEXT: pand %xmm8, %xmm0 1325 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1326 ; SSSE3-NEXT: movdqa %xmm9, %xmm6 1327 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 1328 ; SSSE3-NEXT: psrlw $4, %xmm5 1329 ; SSSE3-NEXT: pand %xmm8, %xmm5 1330 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1331 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 1332 ; SSSE3-NEXT: pshufb %xmm5, %xmm0 1333 ; SSSE3-NEXT: por %xmm6, %xmm0 1334 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 1335 ; SSSE3-NEXT: pand %xmm8, %xmm5 1336 ; SSSE3-NEXT: movdqa %xmm9, %xmm6 1337 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 1338 ; SSSE3-NEXT: psrlw $4, %xmm1 1339 ; SSSE3-NEXT: pand %xmm8, %xmm1 1340 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 1341 ; SSSE3-NEXT: pshufb %xmm1, %xmm5 1342 ; SSSE3-NEXT: por %xmm6, %xmm5 1343 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 1344 ; SSSE3-NEXT: pand %xmm8, %xmm1 1345 ; SSSE3-NEXT: movdqa %xmm9, %xmm7 1346 ; SSSE3-NEXT: pshufb %xmm1, %xmm7 1347 ; SSSE3-NEXT: psrlw $4, %xmm2 1348 ; SSSE3-NEXT: pand %xmm8, %xmm2 1349 ; SSSE3-NEXT: movdqa %xmm4, %xmm6 1350 ; SSSE3-NEXT: pshufb %xmm2, %xmm6 1351 ; SSSE3-NEXT: por %xmm7, %xmm6 1352 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 1353 ; SSSE3-NEXT: pand %xmm8, %xmm1 1354 ; SSSE3-NEXT: pshufb %xmm1, %xmm9 1355 ; SSSE3-NEXT: psrlw $4, %xmm3 1356 ; SSSE3-NEXT: pand %xmm8, %xmm3 1357 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 1358 ; SSSE3-NEXT: por %xmm9, %xmm4 1359 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 1360 ; SSSE3-NEXT: movdqa %xmm6, %xmm2 1361 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 1362 ; SSSE3-NEXT: retq 1363 ; 1364 ; AVX1-LABEL: test_bitreverse_v64i8: 1365 ; AVX1: # %bb.0: 1366 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1367 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1368 
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1369 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1370 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1371 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1372 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1373 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1374 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1375 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1376 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4 1377 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1378 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1379 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1380 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1381 ; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 1382 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1383 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1384 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 1385 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1386 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1387 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1388 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 1389 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 1390 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1391 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1392 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1393 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1394 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1395 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1396 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1397 ; AVX1-NEXT: retq 1398 ; 1399 ; AVX2-LABEL: test_bitreverse_v64i8: 1400 ; AVX2: # %bb.0: 1401 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1402 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 1403 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1404 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1405 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1406 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1407 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1408 ; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1409 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 1410 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 1411 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1412 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1413 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1414 ; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1415 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 1416 ; AVX2-NEXT: retq 1417 ; 1418 ; AVX512F-LABEL: test_bitreverse_v64i8: 1419 ; AVX512F: # %bb.0: 1420 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1421 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 1422 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1423 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1424 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1425 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 1426 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1427 ; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0 1428 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 1429 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 1430 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 1431 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1432 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 1433 ; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 1434 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 1435 ; AVX512F-NEXT: retq 1436 
; 1437 ; AVX512BW-LABEL: test_bitreverse_v64i8: 1438 ; AVX512BW: # %bb.0: 1439 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1440 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1441 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1442 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1443 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1444 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1445 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1446 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1447 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1448 ; AVX512BW-NEXT: retq 1449 ; 1450 ; XOPAVX1-LABEL: test_bitreverse_v64i8: 1451 ; XOPAVX1: # %bb.0: 1452 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1453 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1454 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1455 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1456 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1457 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1458 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1459 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1460 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1461 ; XOPAVX1-NEXT: retq 1462 ; 1463 ; XOPAVX2-LABEL: test_bitreverse_v64i8: 1464 ; XOPAVX2: # %bb.0: 1465 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1466 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 1467 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1468 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1469 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1470 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1471 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1472 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1473 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1474 ; XOPAVX2-NEXT: retq 1475 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 1476 ret <64 x i8> %b 1477 } 1478 1479 define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 1480 ; SSE2-LABEL: test_bitreverse_v32i16: 1481 ; SSE2: # %bb.0: 1482 ; SSE2-NEXT: pxor %xmm14, %xmm14 1483 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1484 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 1485 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 1486 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 1487 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 1488 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 1489 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 1490 ; SSE2-NEXT: packuswb %xmm4, %xmm0 1491 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1492 ; SSE2-NEXT: movdqa %xmm0, %xmm5 1493 ; SSE2-NEXT: pand %xmm8, %xmm5 1494 ; SSE2-NEXT: psllw $4, 
%xmm5 1495 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1496 ; SSE2-NEXT: pand %xmm4, %xmm5 1497 ; SSE2-NEXT: pand %xmm4, %xmm0 1498 ; SSE2-NEXT: psrlw $4, %xmm0 1499 ; SSE2-NEXT: pand %xmm8, %xmm0 1500 ; SSE2-NEXT: por %xmm5, %xmm0 1501 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1502 ; SSE2-NEXT: movdqa %xmm0, %xmm7 1503 ; SSE2-NEXT: pand %xmm5, %xmm7 1504 ; SSE2-NEXT: psllw $2, %xmm7 1505 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1506 ; SSE2-NEXT: pand %xmm9, %xmm7 1507 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1508 ; SSE2-NEXT: pand %xmm10, %xmm0 1509 ; SSE2-NEXT: psrlw $2, %xmm0 1510 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1511 ; SSE2-NEXT: pand %xmm11, %xmm0 1512 ; SSE2-NEXT: por %xmm7, %xmm0 1513 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1514 ; SSE2-NEXT: movdqa %xmm0, %xmm6 1515 ; SSE2-NEXT: pand %xmm7, %xmm6 1516 ; SSE2-NEXT: psrlw $1, %xmm6 1517 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1518 ; SSE2-NEXT: pand %xmm12, %xmm6 1519 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1520 ; SSE2-NEXT: pand %xmm13, %xmm0 1521 ; SSE2-NEXT: paddb %xmm0, %xmm0 1522 ; SSE2-NEXT: por %xmm6, %xmm0 1523 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1524 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1525 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1526 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1527 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 1528 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1529 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 1530 ; SSE2-NEXT: packuswb %xmm6, %xmm1 1531 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1532 ; SSE2-NEXT: pand %xmm8, %xmm6 1533 ; SSE2-NEXT: psllw $4, %xmm6 1534 ; SSE2-NEXT: pand %xmm4, %xmm6 1535 ; SSE2-NEXT: pand %xmm4, %xmm1 1536 ; SSE2-NEXT: psrlw $4, %xmm1 1537 ; SSE2-NEXT: pand %xmm8, %xmm1 1538 ; SSE2-NEXT: por %xmm6, %xmm1 1539 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1540 ; SSE2-NEXT: pand %xmm5, %xmm6 1541 ; SSE2-NEXT: psllw $2, %xmm6 1542 ; SSE2-NEXT: pand %xmm9, %xmm6 1543 ; SSE2-NEXT: pand %xmm10, %xmm1 1544 ; SSE2-NEXT: psrlw $2, %xmm1 1545 ; SSE2-NEXT: pand %xmm11, %xmm1 1546 ; SSE2-NEXT: por %xmm6, %xmm1 1547 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1548 ; SSE2-NEXT: pand %xmm7, %xmm6 1549 ; SSE2-NEXT: psrlw $1, %xmm6 1550 ; SSE2-NEXT: pand %xmm12, %xmm6 1551 ; SSE2-NEXT: pand %xmm13, %xmm1 1552 ; SSE2-NEXT: paddb %xmm1, %xmm1 1553 ; SSE2-NEXT: por %xmm6, %xmm1 1554 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1555 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1556 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1557 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1558 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1559 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 1560 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 1561 ; SSE2-NEXT: packuswb %xmm6, %xmm2 1562 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1563 ; SSE2-NEXT: pand %xmm8, %xmm6 1564 ; SSE2-NEXT: psllw $4, %xmm6 1565 ; SSE2-NEXT: pand %xmm4, %xmm6 1566 ; SSE2-NEXT: pand %xmm4, %xmm2 1567 ; SSE2-NEXT: psrlw $4, %xmm2 1568 ; SSE2-NEXT: pand %xmm8, %xmm2 1569 ; SSE2-NEXT: por %xmm6, %xmm2 1570 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1571 ; SSE2-NEXT: pand %xmm5, %xmm6 1572 ; SSE2-NEXT: psllw $2, %xmm6 1573 ; SSE2-NEXT: pand %xmm9, %xmm6 1574 ; SSE2-NEXT: pand %xmm10, %xmm2 1575 ; SSE2-NEXT: psrlw $2, %xmm2 1576 ; SSE2-NEXT: pand %xmm11, %xmm2 1577 ; SSE2-NEXT: por %xmm6, %xmm2 1578 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1579 ; SSE2-NEXT: pand %xmm7, %xmm6 1580 ; SSE2-NEXT: psrlw $1, %xmm6 1581 ; SSE2-NEXT: pand %xmm12, %xmm6 1582 ; SSE2-NEXT: pand %xmm13, %xmm2 1583 ; SSE2-NEXT: paddb %xmm2, %xmm2 1584 ; SSE2-NEXT: por %xmm6, %xmm2 1585 ; SSE2-NEXT: movdqa %xmm3, %xmm6 1586 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1587 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7] 1588 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6] 1589 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 1590 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] 1591 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6] 1592 ; SSE2-NEXT: packuswb %xmm6, %xmm3 1593 ; SSE2-NEXT: movdqa %xmm3, %xmm6 1594 ; SSE2-NEXT: pand %xmm8, %xmm6 1595 ; SSE2-NEXT: psllw $4, %xmm6 1596 ; SSE2-NEXT: pand %xmm4, %xmm6 1597 ; SSE2-NEXT: pand %xmm4, %xmm3 1598 ; SSE2-NEXT: psrlw $4, %xmm3 1599 ; SSE2-NEXT: pand %xmm8, %xmm3 1600 ; SSE2-NEXT: por %xmm6, %xmm3 1601 ; SSE2-NEXT: pand %xmm3, %xmm5 1602 ; SSE2-NEXT: psllw $2, %xmm5 1603 ; SSE2-NEXT: pand %xmm9, %xmm5 1604 ; SSE2-NEXT: pand %xmm10, %xmm3 1605 ; SSE2-NEXT: psrlw $2, %xmm3 1606 ; SSE2-NEXT: pand %xmm11, %xmm3 1607 ; SSE2-NEXT: por %xmm5, %xmm3 1608 ; SSE2-NEXT: pand %xmm3, %xmm7 1609 ; SSE2-NEXT: psrlw $1, %xmm7 1610 ; SSE2-NEXT: pand %xmm12, %xmm7 1611 ; SSE2-NEXT: pand %xmm13, %xmm3 1612 ; SSE2-NEXT: paddb %xmm3, %xmm3 1613 ; SSE2-NEXT: por %xmm7, %xmm3 1614 ; SSE2-NEXT: retq 1615 ; 1616 ; SSSE3-LABEL: test_bitreverse_v32i16: 1617 ; SSSE3: # %bb.0: 1618 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 1619 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 1620 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1621 ; SSSE3-NEXT: pshufb %xmm8, %xmm1 1622 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1623 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 1624 ; SSSE3-NEXT: pand %xmm9, %xmm0 1625 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1626 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 1627 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 1628 ; SSSE3-NEXT: psrlw $4, %xmm1 1629 ; SSSE3-NEXT: pand %xmm9, %xmm1 1630 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1631 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 1632 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 1633 ; SSSE3-NEXT: por %xmm6, %xmm0 1634 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 1635 ; SSSE3-NEXT: 
movdqa %xmm5, %xmm1 1636 ; SSSE3-NEXT: pand %xmm9, %xmm1 1637 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 1638 ; SSSE3-NEXT: pshufb %xmm1, %xmm6 1639 ; SSSE3-NEXT: psrlw $4, %xmm5 1640 ; SSSE3-NEXT: pand %xmm9, %xmm5 1641 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 1642 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 1643 ; SSSE3-NEXT: por %xmm6, %xmm1 1644 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 1645 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 1646 ; SSSE3-NEXT: pand %xmm9, %xmm5 1647 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 1648 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 1649 ; SSSE3-NEXT: psrlw $4, %xmm2 1650 ; SSSE3-NEXT: pand %xmm9, %xmm2 1651 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 1652 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 1653 ; SSSE3-NEXT: por %xmm6, %xmm5 1654 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 1655 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 1656 ; SSSE3-NEXT: pand %xmm9, %xmm2 1657 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 1658 ; SSSE3-NEXT: psrlw $4, %xmm3 1659 ; SSSE3-NEXT: pand %xmm9, %xmm3 1660 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 1661 ; SSSE3-NEXT: por %xmm7, %xmm4 1662 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 1663 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 1664 ; SSSE3-NEXT: retq 1665 ; 1666 ; AVX1-LABEL: test_bitreverse_v32i16: 1667 ; AVX1: # %bb.0: 1668 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1669 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1670 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1671 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1672 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1673 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1674 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1675 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1676 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1677 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1678 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1679 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1680 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1681 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1682 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1683 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1684 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1685 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 1686 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 1687 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1688 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1689 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1690 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1691 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1692 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1693 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1694 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1695 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1696 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 1697 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 1698 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 1699 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1700 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1701 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 1702 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 1703 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1704 ; AVX1-NEXT: retq 1705 ; 1706 ; AVX2-LABEL: test_bitreverse_v32i16: 1707 ; AVX2: # %bb.0: 1708 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1709 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1710 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1711 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 1712 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1713 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1714 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1715 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 1716 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1717 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1718 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 1719 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1720 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1721 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1722 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 1723 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 1724 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1725 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 1726 ; AVX2-NEXT: retq 1727 ; 1728 ; AVX512F-LABEL: test_bitreverse_v32i16: 1729 ; AVX512F: # %bb.0: 1730 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1731 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 1732 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1733 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4 1734 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1735 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 1736 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 1737 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 1738 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1739 ; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0 1740 ; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 1741 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 1742 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2 1743 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 1744 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 1745 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 1746 ; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1 1747 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 1748 ; AVX512F-NEXT: retq 1749 ; 1750 ; AVX512BW-LABEL: test_bitreverse_v32i16: 1751 ; AVX512BW: # %bb.0: 1752 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 1753 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1754 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 1755 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1756 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 1757 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 1758 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 1759 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1760 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 1761 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 1762 ; AVX512BW-NEXT: retq 1763 ; 1764 ; XOPAVX1-LABEL: test_bitreverse_v32i16: 1765 ; XOPAVX1: # %bb.0: 1766 ; XOPAVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm2 1767 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1768 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1769 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1770 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1771 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1772 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1773 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1774 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1775 ; XOPAVX1-NEXT: retq 1776 ; 1777 ; XOPAVX2-LABEL: test_bitreverse_v32i16: 1778 ; XOPAVX2: # %bb.0: 1779 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1780 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1781 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1782 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 1783 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 1784 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1785 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 1786 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 1787 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1788 ; XOPAVX2-NEXT: retq 1789 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 1790 ret <32 x i16> %b 1791 } 1792 1793 define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 1794 ; SSE2-LABEL: test_bitreverse_v16i32: 1795 ; SSE2: # %bb.0: 1796 ; SSE2-NEXT: pxor %xmm14, %xmm14 1797 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1798 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 1799 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 1800 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 1801 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 1802 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1803 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1804 ; SSE2-NEXT: packuswb %xmm4, %xmm0 1805 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1806 ; SSE2-NEXT: movdqa %xmm0, %xmm5 1807 ; SSE2-NEXT: pand %xmm8, %xmm5 1808 ; SSE2-NEXT: psllw $4, %xmm5 1809 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 1810 ; SSE2-NEXT: pand %xmm4, %xmm5 1811 ; SSE2-NEXT: pand %xmm4, %xmm0 1812 ; SSE2-NEXT: psrlw $4, %xmm0 1813 ; SSE2-NEXT: pand %xmm8, %xmm0 1814 ; SSE2-NEXT: por %xmm5, %xmm0 1815 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 1816 ; SSE2-NEXT: movdqa %xmm0, %xmm7 1817 ; SSE2-NEXT: pand %xmm5, %xmm7 1818 ; SSE2-NEXT: psllw $2, %xmm7 1819 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 1820 ; SSE2-NEXT: pand %xmm9, %xmm7 1821 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 1822 ; SSE2-NEXT: pand %xmm10, %xmm0 1823 ; SSE2-NEXT: psrlw $2, %xmm0 1824 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 1825 ; SSE2-NEXT: pand %xmm11, %xmm0 1826 ; SSE2-NEXT: por %xmm7, %xmm0 1827 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 1828 ; SSE2-NEXT: movdqa %xmm0, %xmm6 1829 ; SSE2-NEXT: pand %xmm7, %xmm6 1830 ; SSE2-NEXT: psrlw $1, %xmm6 1831 ; SSE2-NEXT: 
movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1832 ; SSE2-NEXT: pand %xmm12, %xmm6 1833 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 1834 ; SSE2-NEXT: pand %xmm13, %xmm0 1835 ; SSE2-NEXT: paddb %xmm0, %xmm0 1836 ; SSE2-NEXT: por %xmm6, %xmm0 1837 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1838 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1839 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1840 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1841 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 1842 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1843 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1844 ; SSE2-NEXT: packuswb %xmm6, %xmm1 1845 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1846 ; SSE2-NEXT: pand %xmm8, %xmm6 1847 ; SSE2-NEXT: psllw $4, %xmm6 1848 ; SSE2-NEXT: pand %xmm4, %xmm6 1849 ; SSE2-NEXT: pand %xmm4, %xmm1 1850 ; SSE2-NEXT: psrlw $4, %xmm1 1851 ; SSE2-NEXT: pand %xmm8, %xmm1 1852 ; SSE2-NEXT: por %xmm6, %xmm1 1853 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1854 ; SSE2-NEXT: pand %xmm5, %xmm6 1855 ; SSE2-NEXT: psllw $2, %xmm6 1856 ; SSE2-NEXT: pand %xmm9, %xmm6 1857 ; SSE2-NEXT: pand %xmm10, %xmm1 1858 ; SSE2-NEXT: psrlw $2, %xmm1 1859 ; SSE2-NEXT: pand %xmm11, %xmm1 1860 ; SSE2-NEXT: por %xmm6, %xmm1 1861 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1862 ; SSE2-NEXT: pand %xmm7, %xmm6 1863 ; SSE2-NEXT: psrlw $1, %xmm6 1864 ; SSE2-NEXT: pand %xmm12, %xmm6 1865 ; SSE2-NEXT: pand %xmm13, %xmm1 1866 ; SSE2-NEXT: paddb %xmm1, %xmm1 1867 ; SSE2-NEXT: por %xmm6, %xmm1 1868 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1869 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1870 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1871 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1872 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 1873 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1874 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1875 ; SSE2-NEXT: packuswb %xmm6, %xmm2 1876 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1877 ; SSE2-NEXT: pand %xmm8, %xmm6 1878 ; SSE2-NEXT: psllw $4, %xmm6 1879 ; SSE2-NEXT: pand %xmm4, %xmm6 1880 ; SSE2-NEXT: pand %xmm4, %xmm2 1881 ; SSE2-NEXT: psrlw $4, %xmm2 1882 ; SSE2-NEXT: pand %xmm8, %xmm2 1883 ; SSE2-NEXT: por %xmm6, %xmm2 1884 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1885 ; SSE2-NEXT: pand %xmm5, %xmm6 1886 ; SSE2-NEXT: psllw $2, %xmm6 1887 ; SSE2-NEXT: pand %xmm9, %xmm6 1888 ; SSE2-NEXT: pand %xmm10, %xmm2 1889 ; SSE2-NEXT: psrlw $2, %xmm2 1890 ; SSE2-NEXT: pand %xmm11, %xmm2 1891 ; SSE2-NEXT: por %xmm6, %xmm2 1892 ; SSE2-NEXT: movdqa %xmm2, %xmm6 1893 ; SSE2-NEXT: pand %xmm7, %xmm6 1894 ; SSE2-NEXT: psrlw $1, %xmm6 1895 ; SSE2-NEXT: pand %xmm12, %xmm6 1896 ; SSE2-NEXT: pand %xmm13, %xmm2 1897 ; SSE2-NEXT: paddb %xmm2, %xmm2 1898 ; SSE2-NEXT: por %xmm6, %xmm2 1899 ; SSE2-NEXT: movdqa %xmm3, %xmm6 1900 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 1901 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 1902 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 1903 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 1904 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 1905 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 1906 ; SSE2-NEXT: packuswb %xmm6, %xmm3 1907 ; SSE2-NEXT: movdqa %xmm3, %xmm6 1908 ; SSE2-NEXT: pand %xmm8, %xmm6 1909 ; SSE2-NEXT: psllw $4, %xmm6 1910 ; SSE2-NEXT: pand %xmm4, %xmm6 1911 ; SSE2-NEXT: pand %xmm4, %xmm3 1912 ; SSE2-NEXT: psrlw $4, %xmm3 1913 ; SSE2-NEXT: pand %xmm8, %xmm3 1914 ; SSE2-NEXT: por %xmm6, %xmm3 1915 ; SSE2-NEXT: pand %xmm3, %xmm5 1916 ; SSE2-NEXT: psllw $2, %xmm5 1917 ; SSE2-NEXT: pand %xmm9, %xmm5 1918 ; SSE2-NEXT: pand %xmm10, %xmm3 1919 ; SSE2-NEXT: psrlw $2, %xmm3 1920 ; SSE2-NEXT: pand %xmm11, %xmm3 1921 ; SSE2-NEXT: por %xmm5, %xmm3 1922 ; SSE2-NEXT: pand %xmm3, %xmm7 1923 ; SSE2-NEXT: psrlw $1, %xmm7 1924 ; SSE2-NEXT: pand %xmm12, %xmm7 1925 ; SSE2-NEXT: pand %xmm13, %xmm3 1926 ; SSE2-NEXT: paddb %xmm3, %xmm3 1927 ; SSE2-NEXT: por %xmm7, %xmm3 1928 ; SSE2-NEXT: retq 1929 ; 1930 ; SSSE3-LABEL: test_bitreverse_v16i32: 1931 ; SSSE3: # %bb.0: 1932 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 1933 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 1934 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1935 ; SSSE3-NEXT: pshufb %xmm8, %xmm1 1936 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1937 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 1938 ; SSSE3-NEXT: pand %xmm9, %xmm0 1939 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1940 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 1941 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 1942 ; SSSE3-NEXT: psrlw $4, %xmm1 1943 ; SSSE3-NEXT: pand %xmm9, %xmm1 1944 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1945 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 1946 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 1947 ; SSSE3-NEXT: por %xmm6, %xmm0 1948 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 1949 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 1950 ; SSSE3-NEXT: pand %xmm9, %xmm1 1951 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 1952 ; SSSE3-NEXT: pshufb %xmm1, %xmm6 1953 ; SSSE3-NEXT: psrlw $4, %xmm5 1954 ; SSSE3-NEXT: pand %xmm9, %xmm5 1955 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 1956 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 1957 ; SSSE3-NEXT: por %xmm6, %xmm1 1958 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 1959 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 1960 ; SSSE3-NEXT: pand %xmm9, %xmm5 1961 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 1962 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 1963 ; SSSE3-NEXT: psrlw $4, %xmm2 1964 ; SSSE3-NEXT: pand %xmm9, %xmm2 1965 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 1966 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 1967 ; SSSE3-NEXT: por %xmm6, %xmm5 1968 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 1969 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 1970 ; SSSE3-NEXT: pand %xmm9, %xmm2 1971 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 1972 ; SSSE3-NEXT: psrlw $4, %xmm3 1973 ; SSSE3-NEXT: pand %xmm9, %xmm3 1974 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 1975 ; SSSE3-NEXT: por %xmm7, %xmm4 1976 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 1977 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 1978 ; SSSE3-NEXT: retq 1979 ; 1980 ; AVX1-LABEL: test_bitreverse_v16i32: 1981 ; AVX1: # %bb.0: 1982 ; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm2 1983 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1984 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1985 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1986 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 1987 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1988 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1989 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 1990 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 1991 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1992 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 1993 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 1994 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 1995 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 1996 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 1997 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1998 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1999 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 2000 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2001 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2002 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2003 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2004 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2005 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2006 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2007 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2008 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2009 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2010 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2011 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2012 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2013 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2014 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2015 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2016 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2017 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2018 ; AVX1-NEXT: retq 2019 ; 2020 ; AVX2-LABEL: test_bitreverse_v16i32: 2021 ; AVX2: # %bb.0: 2022 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2023 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2024 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2025 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2026 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2027 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2028 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2029 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2030 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2031 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2032 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2033 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2034 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2035 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2036 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2037 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2038 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2039 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2040 ; AVX2-NEXT: retq 2041 ; 2042 ; AVX512F-LABEL: test_bitreverse_v16i32: 2043 ; AVX512F: # %bb.0: 2044 ; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1 2045 ; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2 2046 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 2047 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2048 ; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2 2049 ; AVX512F-NEXT: vpslld $8, %zmm0, %zmm0 2050 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2051 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2052 ; AVX512F-NEXT: vporq 
%zmm0, %zmm2, %zmm0 2053 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2054 ; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1 2055 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2056 ; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0 2057 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2058 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2059 ; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1 2060 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2061 ; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0 2062 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2063 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 2064 ; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1 2065 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 2066 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 2067 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2068 ; AVX512F-NEXT: retq 2069 ; 2070 ; AVX512BW-LABEL: test_bitreverse_v16i32: 2071 ; AVX512BW: # %bb.0: 2072 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 2073 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2074 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2075 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2076 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2077 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2078 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2079 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2080 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2081 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2082 ; AVX512BW-NEXT: retq 2083 ; 2084 ; XOPAVX1-LABEL: test_bitreverse_v16i32: 2085 ; XOPAVX1: # %bb.0: 2086 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2087 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2088 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2089 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2090 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2091 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2092 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2093 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2094 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2095 ; XOPAVX1-NEXT: retq 2096 ; 2097 ; XOPAVX2-LABEL: test_bitreverse_v16i32: 2098 ; XOPAVX2: # %bb.0: 2099 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2100 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 2101 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2102 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2103 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2104 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2105 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2106 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2107 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2108 ; XOPAVX2-NEXT: retq 2109 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 2110 ret <16 x i32> %b 2111 } 2112 2113 
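; AVX512F alone has no 512-bit vpshufb (that needs AVX512BW), so the v8i64 expansion below
; is expected to byte-reverse each element with vpsllq/vpsrlq shift+mask pairs and then swap
; nibbles, bit-pairs and single bits with mask-and-shift stages, while AVX512BW keeps the
; byte-shuffle + nibble-LUT pshufb pattern used for the narrower element types.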
define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 2114 ; SSE2-LABEL: test_bitreverse_v8i64: 2115 ; SSE2: # %bb.0: 2116 ; SSE2-NEXT: pxor %xmm14, %xmm14 2117 ; SSE2-NEXT: movdqa %xmm0, %xmm4 2118 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15] 2119 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2120 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2121 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2122 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] 2123 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 2124 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2125 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2126 ; SSE2-NEXT: packuswb %xmm4, %xmm0 2127 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2128 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2129 ; SSE2-NEXT: pand %xmm8, %xmm5 2130 ; SSE2-NEXT: psllw $4, %xmm5 2131 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 2132 ; SSE2-NEXT: pand %xmm4, %xmm5 2133 ; SSE2-NEXT: pand %xmm4, %xmm0 2134 ; SSE2-NEXT: psrlw $4, %xmm0 2135 ; SSE2-NEXT: pand %xmm8, %xmm0 2136 ; SSE2-NEXT: por %xmm5, %xmm0 2137 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] 2138 ; SSE2-NEXT: movdqa %xmm0, %xmm7 2139 ; SSE2-NEXT: pand %xmm5, %xmm7 2140 ; SSE2-NEXT: psllw $2, %xmm7 2141 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 2142 ; SSE2-NEXT: pand %xmm9, %xmm7 2143 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204] 2144 ; SSE2-NEXT: pand %xmm10, %xmm0 2145 ; SSE2-NEXT: psrlw $2, %xmm0 2146 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 2147 ; SSE2-NEXT: pand %xmm11, %xmm0 2148 ; SSE2-NEXT: por %xmm7, %xmm0 2149 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170] 2150 ; SSE2-NEXT: movdqa %xmm0, %xmm6 2151 ; SSE2-NEXT: pand %xmm7, %xmm6 2152 ; SSE2-NEXT: psrlw $1, %xmm6 2153 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 2154 ; SSE2-NEXT: pand %xmm12, %xmm6 2155 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85] 2156 ; SSE2-NEXT: pand %xmm13, %xmm0 2157 ; SSE2-NEXT: paddb %xmm0, %xmm0 2158 ; SSE2-NEXT: por %xmm6, %xmm0 2159 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2160 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2161 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2162 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2163 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2164 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] 2165 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2166 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2167 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2168 ; SSE2-NEXT: packuswb %xmm6, %xmm1 2169 ; SSE2-NEXT: movdqa 
%xmm1, %xmm6 2170 ; SSE2-NEXT: pand %xmm8, %xmm6 2171 ; SSE2-NEXT: psllw $4, %xmm6 2172 ; SSE2-NEXT: pand %xmm4, %xmm6 2173 ; SSE2-NEXT: pand %xmm4, %xmm1 2174 ; SSE2-NEXT: psrlw $4, %xmm1 2175 ; SSE2-NEXT: pand %xmm8, %xmm1 2176 ; SSE2-NEXT: por %xmm6, %xmm1 2177 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2178 ; SSE2-NEXT: pand %xmm5, %xmm6 2179 ; SSE2-NEXT: psllw $2, %xmm6 2180 ; SSE2-NEXT: pand %xmm9, %xmm6 2181 ; SSE2-NEXT: pand %xmm10, %xmm1 2182 ; SSE2-NEXT: psrlw $2, %xmm1 2183 ; SSE2-NEXT: pand %xmm11, %xmm1 2184 ; SSE2-NEXT: por %xmm6, %xmm1 2185 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2186 ; SSE2-NEXT: pand %xmm7, %xmm6 2187 ; SSE2-NEXT: psrlw $1, %xmm6 2188 ; SSE2-NEXT: pand %xmm12, %xmm6 2189 ; SSE2-NEXT: pand %xmm13, %xmm1 2190 ; SSE2-NEXT: paddb %xmm1, %xmm1 2191 ; SSE2-NEXT: por %xmm6, %xmm1 2192 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2193 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2194 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2195 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2196 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2197 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] 2198 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 2199 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2200 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2201 ; SSE2-NEXT: packuswb %xmm6, %xmm2 2202 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2203 ; SSE2-NEXT: pand %xmm8, %xmm6 2204 ; SSE2-NEXT: psllw $4, %xmm6 2205 ; SSE2-NEXT: pand %xmm4, %xmm6 2206 ; SSE2-NEXT: pand %xmm4, %xmm2 2207 ; SSE2-NEXT: psrlw $4, %xmm2 2208 ; SSE2-NEXT: pand %xmm8, %xmm2 2209 ; SSE2-NEXT: por %xmm6, %xmm2 2210 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2211 ; SSE2-NEXT: pand %xmm5, %xmm6 2212 ; SSE2-NEXT: psllw $2, %xmm6 2213 ; SSE2-NEXT: pand %xmm9, %xmm6 2214 ; SSE2-NEXT: pand %xmm10, %xmm2 2215 ; SSE2-NEXT: psrlw $2, %xmm2 2216 ; SSE2-NEXT: pand %xmm11, %xmm2 2217 ; SSE2-NEXT: por %xmm6, %xmm2 2218 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2219 ; SSE2-NEXT: pand %xmm7, %xmm6 2220 ; SSE2-NEXT: psrlw $1, %xmm6 2221 ; SSE2-NEXT: pand %xmm12, %xmm6 2222 ; SSE2-NEXT: pand %xmm13, %xmm2 2223 ; SSE2-NEXT: paddb %xmm2, %xmm2 2224 ; SSE2-NEXT: por %xmm6, %xmm2 2225 ; SSE2-NEXT: movdqa %xmm3, %xmm6 2226 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] 2227 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] 2228 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] 2229 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] 2230 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] 2231 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 2232 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2233 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2234 ; SSE2-NEXT: packuswb %xmm6, %xmm3 2235 ; SSE2-NEXT: movdqa %xmm3, %xmm6 2236 ; SSE2-NEXT: pand %xmm8, %xmm6 2237 ; SSE2-NEXT: psllw $4, %xmm6 2238 ; SSE2-NEXT: pand %xmm4, %xmm6 2239 ; SSE2-NEXT: pand %xmm4, %xmm3 2240 ; SSE2-NEXT: psrlw $4, %xmm3 2241 ; SSE2-NEXT: pand %xmm8, %xmm3 2242 ; SSE2-NEXT: por %xmm6, %xmm3 2243 ; 
SSE2-NEXT: pand %xmm3, %xmm5 2244 ; SSE2-NEXT: psllw $2, %xmm5 2245 ; SSE2-NEXT: pand %xmm9, %xmm5 2246 ; SSE2-NEXT: pand %xmm10, %xmm3 2247 ; SSE2-NEXT: psrlw $2, %xmm3 2248 ; SSE2-NEXT: pand %xmm11, %xmm3 2249 ; SSE2-NEXT: por %xmm5, %xmm3 2250 ; SSE2-NEXT: pand %xmm3, %xmm7 2251 ; SSE2-NEXT: psrlw $1, %xmm7 2252 ; SSE2-NEXT: pand %xmm12, %xmm7 2253 ; SSE2-NEXT: pand %xmm13, %xmm3 2254 ; SSE2-NEXT: paddb %xmm3, %xmm3 2255 ; SSE2-NEXT: por %xmm7, %xmm3 2256 ; SSE2-NEXT: retq 2257 ; 2258 ; SSSE3-LABEL: test_bitreverse_v8i64: 2259 ; SSSE3: # %bb.0: 2260 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 2261 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 2262 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2263 ; SSSE3-NEXT: pshufb %xmm8, %xmm1 2264 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2265 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 2266 ; SSSE3-NEXT: pand %xmm9, %xmm0 2267 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2268 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 2269 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 2270 ; SSSE3-NEXT: psrlw $4, %xmm1 2271 ; SSSE3-NEXT: pand %xmm9, %xmm1 2272 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2273 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 2274 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 2275 ; SSSE3-NEXT: por %xmm6, %xmm0 2276 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 2277 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 2278 ; SSSE3-NEXT: pand %xmm9, %xmm1 2279 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 2280 ; SSSE3-NEXT: pshufb %xmm1, %xmm6 2281 ; SSSE3-NEXT: psrlw $4, %xmm5 2282 ; SSSE3-NEXT: pand %xmm9, %xmm5 2283 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 2284 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 2285 ; SSSE3-NEXT: por %xmm6, %xmm1 2286 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 2287 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 2288 ; SSSE3-NEXT: pand %xmm9, %xmm5 2289 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 2290 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 2291 ; SSSE3-NEXT: psrlw $4, %xmm2 2292 ; SSSE3-NEXT: pand %xmm9, %xmm2 2293 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 2294 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 2295 ; SSSE3-NEXT: por %xmm6, %xmm5 2296 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 2297 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 2298 ; SSSE3-NEXT: pand %xmm9, %xmm2 2299 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 2300 ; SSSE3-NEXT: psrlw $4, %xmm3 2301 ; SSSE3-NEXT: pand %xmm9, %xmm3 2302 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 2303 ; SSSE3-NEXT: por %xmm7, %xmm4 2304 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 2305 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 2306 ; SSSE3-NEXT: retq 2307 ; 2308 ; AVX1-LABEL: test_bitreverse_v8i64: 2309 ; AVX1: # %bb.0: 2310 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2311 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2312 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2313 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2314 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2315 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2316 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2317 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2318 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2319 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2320 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2321 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2322 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2323 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2324 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2325 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2326 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2327 ; AVX1-NEXT: 
vpshufb %xmm0, %xmm7, %xmm0 2328 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2329 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2330 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2331 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2332 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2333 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2334 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2335 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2336 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2337 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2338 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2339 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2340 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2341 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2342 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2343 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2344 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2345 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2346 ; AVX1-NEXT: retq 2347 ; 2348 ; AVX2-LABEL: test_bitreverse_v8i64: 2349 ; AVX2: # %bb.0: 2350 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2351 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2352 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2353 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2354 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2355 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2356 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2357 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2358 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2359 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2360 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2361 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2362 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2363 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2364 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2365 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2366 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2367 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2368 ; AVX2-NEXT: retq 2369 ; 2370 ; AVX512F-LABEL: test_bitreverse_v8i64: 2371 ; AVX512F: # %bb.0: 2372 ; AVX512F-NEXT: vpsrlq $56, %zmm0, %zmm1 2373 ; AVX512F-NEXT: vpsrlq $40, %zmm0, %zmm2 2374 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2375 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2376 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 2377 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2378 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2379 ; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 2380 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2381 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 2382 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 2383 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 2384 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 2385 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 2386 ; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2 2387 ; AVX512F-NEXT: vpsllq $56, %zmm0, %zmm3 2388 ; AVX512F-NEXT: vpsllq $40, %zmm0, %zmm0 2389 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2390 ; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0 2391 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2392 ; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 2393 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 2394 ; AVX512F-NEXT: vpsllq $4, %zmm1, %zmm1 2395 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2396 ; AVX512F-NEXT: vpsrlq $4, %zmm0, %zmm0 2397 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2398 ; AVX512F-NEXT: vpandq 
{{.*}}(%rip){1to8}, %zmm0, %zmm1 2399 ; AVX512F-NEXT: vpsllq $2, %zmm1, %zmm1 2400 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2401 ; AVX512F-NEXT: vpsrlq $2, %zmm0, %zmm0 2402 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2403 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 2404 ; AVX512F-NEXT: vpsllq $1, %zmm1, %zmm1 2405 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 2406 ; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0 2407 ; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 2408 ; AVX512F-NEXT: retq 2409 ; 2410 ; AVX512BW-LABEL: test_bitreverse_v8i64: 2411 ; AVX512BW: # %bb.0: 2412 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 2413 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2414 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2415 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2416 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2417 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2418 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2419 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2420 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2421 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2422 ; AVX512BW-NEXT: retq 2423 ; 2424 ; XOPAVX1-LABEL: test_bitreverse_v8i64: 2425 ; XOPAVX1: # %bb.0: 2426 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2427 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2428 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2429 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2430 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2431 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2432 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2433 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2434 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2435 ; XOPAVX1-NEXT: retq 2436 ; 2437 ; XOPAVX2-LABEL: test_bitreverse_v8i64: 2438 ; XOPAVX2: # %bb.0: 2439 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2440 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2441 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2442 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2443 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2444 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2445 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2446 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2447 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2448 ; XOPAVX2-NEXT: retq 2449 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) 2450 ret <8 x i64> %b 2451 } 2452 2453 ; 2454 ; Constant Folding 2455 ; 2456 2457 define i32 @fold_bitreverse_i32() nounwind { 2458 ; ALL-LABEL: fold_bitreverse_i32: 2459 ; ALL: # %bb.0: 2460 ; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF 2461 ; ALL-NEXT: retq 2462 %b = call i32 @llvm.bitreverse.i32(i32 4278255360) 2463 ret i32 %b 2464 } 2465 2466 
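; Each vector element is reversed at compile time; for example
; bitreverse(i8 -3) = bitreverse(0b11111101) = 0b10111111 = 191, which is element 3 of the
; folded v16i8 constant below.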
define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; SSE-LABEL: fold_bitreverse_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT: retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT: retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
  ret <16 x i8> %b
}

define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT: retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT: retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
  ret <16 x i16> %b
}

define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; SSE-NEXT: retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT: retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT: retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT: retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
  ret <16 x i32> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone