; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE: # BB#0:
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: shlb $7, %al
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shlb $5, %cl
; SSE-NEXT: andb $64, %cl
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shlb $3, %dl
; SSE-NEXT: andb $32, %dl
; SSE-NEXT: orb %cl, %dl
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: addb %cl, %cl
; SSE-NEXT: andb $16, %cl
; SSE-NEXT: orb %dl, %cl
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrb %dl
; SSE-NEXT: andb $8, %dl
; SSE-NEXT: orb %cl, %dl
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrb $3, %cl
; SSE-NEXT: andb $4, %cl
; SSE-NEXT: orb %dl, %cl
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrb $5, %dl
; SSE-NEXT: andb $2, %dl
; SSE-NEXT: orb %cl, %dl
; SSE-NEXT: shrb $7, %dil
; SSE-NEXT: orb %dl, %dil
; SSE-NEXT: orb %al, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX: # BB#0:
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: shlb $7, %al
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shlb $5, %cl
; AVX-NEXT: andb $64, %cl
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shlb $3, %dl
; AVX-NEXT: andb $32, %dl
; AVX-NEXT: orb %cl, %dl
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: addb %cl, %cl
; AVX-NEXT: andb $16, %cl
; AVX-NEXT: orb %dl, %cl
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrb %dl
; AVX-NEXT: andb $8, %dl
; AVX-NEXT: orb %cl, %dl
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrb $3, %cl
; AVX-NEXT: andb $4, %cl
; AVX-NEXT: orb %dl, %cl
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrb $5, %dl
; AVX-NEXT: andb $2, %dl
; AVX-NEXT: orb %cl, %dl
; AVX-NEXT: shrb $7, %dil
; AVX-NEXT: orb %dl, %dil
; AVX-NEXT: orb %al, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP: # BB#0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vpextrb $0, %xmm0, %eax
; XOP-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; XOP-NEXT: retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE: # BB#0:
; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: shll $15, %eax
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: andl $2, %edx
; SSE-NEXT: shll $13, %edx
; SSE-NEXT: leal (%rdx,%rax), %eax
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: andl $4, %edx
; SSE-NEXT: shll $11, %edx
; SSE-NEXT: orl %edx, %eax
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: andl $8, %edx
; SSE-NEXT: shll $9, %edx
; SSE-NEXT: orl %edx, %eax
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: andl $16, %edx
; SSE-NEXT: shll $7, %edx
; SSE-NEXT: orl %edx, %eax
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: andl $32, %edx
; SSE-NEXT: shll $5, %edx
; SSE-NEXT: orl %edx, %eax
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: andl $64, %edx
; SSE-NEXT: shll $3, %edx
; SSE-NEXT: leal (%rdi,%rdi), %esi
; SSE-NEXT: andl $256, %esi # imm = 0x100
; SSE-NEXT: orl %edx, %esi
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl %edx
; SSE-NEXT: andl $128, %edx
; SSE-NEXT: orl %esi, %edx
; SSE-NEXT: movl %edi, %esi
; SSE-NEXT: shrl $3, %esi
; SSE-NEXT: andl $64, %esi
; SSE-NEXT: orl %edx, %esi
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $5, %edx
; SSE-NEXT: andl $32, %edx
; SSE-NEXT: orl %esi, %edx
; SSE-NEXT: movl %edi, %esi
; SSE-NEXT: shrl $7, %esi
; SSE-NEXT: andl $16, %esi
; SSE-NEXT: orl %edx, %esi
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $9, %edx
; SSE-NEXT: andl $8, %edx
; SSE-NEXT: orl %esi, %edx
; SSE-NEXT: movl %edi, %esi
; SSE-NEXT: shrl $11, %esi
; SSE-NEXT: andl $4, %esi
; SSE-NEXT: orl %edx, %esi
; SSE-NEXT: shrl $13, %edi
; SSE-NEXT: andl $2, %edi
; SSE-NEXT: orl %esi, %edi
; SSE-NEXT: shrl $15, %ecx
; SSE-NEXT: orl %edi, %ecx
; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX: # BB#0:
; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: shll $15, %eax
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: andl $2, %edx
; AVX-NEXT: shll $13, %edx
; AVX-NEXT: leal (%rdx,%rax), %eax
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: andl $4, %edx
; AVX-NEXT: shll $11, %edx
; AVX-NEXT: orl %edx, %eax
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: andl $8, %edx
; AVX-NEXT: shll $9, %edx
; AVX-NEXT: orl %edx, %eax
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: andl $16, %edx
; AVX-NEXT: shll $7, %edx
; AVX-NEXT: orl %edx, %eax
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: andl $32, %edx
; AVX-NEXT: shll $5, %edx
; AVX-NEXT: orl %edx, %eax
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: andl $64, %edx
; AVX-NEXT: shll $3, %edx
; AVX-NEXT: leal (%rdi,%rdi), %esi
; AVX-NEXT: andl $256, %esi # imm = 0x100
; AVX-NEXT: orl %edx, %esi
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl %edx
; AVX-NEXT: andl $128, %edx
; AVX-NEXT: orl %esi, %edx
; AVX-NEXT: movl %edi, %esi
; AVX-NEXT: shrl $3, %esi
; AVX-NEXT: andl $64, %esi
; AVX-NEXT: orl %edx, %esi
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $5, %edx
; AVX-NEXT: andl $32, %edx
; AVX-NEXT: orl %esi, %edx
; AVX-NEXT: movl %edi, %esi
; AVX-NEXT: shrl $7, %esi
; AVX-NEXT: andl $16, %esi
; AVX-NEXT: orl %edx, %esi
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $9, %edx
; AVX-NEXT: andl $8, %edx
; AVX-NEXT: orl %esi, %edx
; AVX-NEXT: movl %edi, %esi
; AVX-NEXT: shrl $11, %esi
; AVX-NEXT: andl $4, %esi
; AVX-NEXT: orl %edx, %esi
; AVX-NEXT: shrl $13, %edi
; AVX-NEXT: andl $2, %edi
; AVX-NEXT: orl %esi, %edi
; AVX-NEXT: shrl $15, %ecx
; AVX-NEXT: orl %edi, %ecx
; AVX-NEXT: orl %ecx, %eax
; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP: # BB#0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; XOP-NEXT: retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE: # BB#0:
; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: shll $31, %eax
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: andl $2, %ecx
; SSE-NEXT: shll $29, %ecx
; SSE-NEXT: leal (%rcx,%rax), %eax
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: andl $4, %ecx
; SSE-NEXT: shll $27, %ecx
; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: andl $8, %ecx
; SSE-NEXT: shll $25, %ecx
; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: andl $16, %ecx
; SSE-NEXT: shll $23, %ecx
; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: andl $32, %ecx
; SSE-NEXT: shll $21, %ecx
; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: andl $64, %ecx
; SSE-NEXT: shll $19, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shll $17, %edx
; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shll $15, %ecx
; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shll $13, %edx
; SSE-NEXT: andl $4194304, %edx # imm = 0x400000
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shll $11, %ecx
; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shll $9, %edx
; SSE-NEXT: andl $1048576, %edx # imm = 0x100000
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shll $7, %ecx
; SSE-NEXT: andl $524288, %ecx # imm = 0x80000
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shll $5, %edx
; SSE-NEXT: andl $262144, %edx # imm = 0x40000
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: leal (,%rdi,8), %ecx
; SSE-NEXT: andl $131072, %ecx # imm = 0x20000
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: leal (%rdi,%rdi), %edx
; SSE-NEXT: andl $65536, %edx # imm = 0x10000
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrl %ecx
; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $3, %edx
; SSE-NEXT: andl $16384, %edx # imm = 0x4000
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrl $5, %ecx
; SSE-NEXT: andl $8192, %ecx # imm = 0x2000
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $7, %edx
; SSE-NEXT: andl $4096, %edx # imm = 0x1000
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrl $9, %ecx
; SSE-NEXT: andl $2048, %ecx # imm = 0x800
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $11, %edx
; SSE-NEXT: andl $1024, %edx # imm = 0x400
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrl $13, %ecx
; SSE-NEXT: andl $512, %ecx # imm = 0x200
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $15, %edx
; SSE-NEXT: andl $256, %edx # imm = 0x100
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrl $17, %ecx
; SSE-NEXT: andl $128, %ecx
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $19, %edx
; SSE-NEXT: andl $64, %edx
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrl $21, %ecx
; SSE-NEXT: andl $32, %ecx
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $23, %edx
; SSE-NEXT: andl $16, %edx
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrl $25, %ecx
; SSE-NEXT: andl $8, %ecx
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: movl %edi, %edx
; SSE-NEXT: shrl $27, %edx
; SSE-NEXT: andl $4, %edx
; SSE-NEXT: orl %ecx, %edx
; SSE-NEXT: movl %edi, %ecx
; SSE-NEXT: shrl $29, %ecx
; SSE-NEXT: andl $2, %ecx
; SSE-NEXT: orl %edx, %ecx
; SSE-NEXT: shrl $31, %edi
; SSE-NEXT: orl %ecx, %edi
; SSE-NEXT: orl %edi, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX: # BB#0:
; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: shll $31, %eax
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: andl $2, %ecx
; AVX-NEXT: shll $29, %ecx
; AVX-NEXT: leal (%rcx,%rax), %eax
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: andl $4, %ecx
; AVX-NEXT: shll $27, %ecx
; AVX-NEXT: orl %ecx, %eax
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: andl $8, %ecx
; AVX-NEXT: shll $25, %ecx
; AVX-NEXT: orl %ecx, %eax
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: andl $16, %ecx
; AVX-NEXT: shll $23, %ecx
; AVX-NEXT: orl %ecx, %eax
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: andl $32, %ecx
; AVX-NEXT: shll $21, %ecx
; AVX-NEXT: orl %ecx, %eax
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: andl $64, %ecx
; AVX-NEXT: shll $19, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shll $17, %edx
; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shll $15, %ecx
; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shll $13, %edx
; AVX-NEXT: andl $4194304, %edx # imm = 0x400000
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shll $11, %ecx
; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shll $9, %edx
; AVX-NEXT: andl $1048576, %edx # imm = 0x100000
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shll $7, %ecx
; AVX-NEXT: andl $524288, %ecx # imm = 0x80000
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shll $5, %edx
; AVX-NEXT: andl $262144, %edx # imm = 0x40000
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: leal (,%rdi,8), %ecx
; AVX-NEXT: andl $131072, %ecx # imm = 0x20000
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: leal (%rdi,%rdi), %edx
; AVX-NEXT: andl $65536, %edx # imm = 0x10000
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrl %ecx
; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $3, %edx
; AVX-NEXT: andl $16384, %edx # imm = 0x4000
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrl $5, %ecx
; AVX-NEXT: andl $8192, %ecx # imm = 0x2000
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $7, %edx
; AVX-NEXT: andl $4096, %edx # imm = 0x1000
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrl $9, %ecx
; AVX-NEXT: andl $2048, %ecx # imm = 0x800
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $11, %edx
; AVX-NEXT: andl $1024, %edx # imm = 0x400
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrl $13, %ecx
; AVX-NEXT: andl $512, %ecx # imm = 0x200
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $15, %edx
; AVX-NEXT: andl $256, %edx # imm = 0x100
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrl $17, %ecx
; AVX-NEXT: andl $128, %ecx
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $19, %edx
; AVX-NEXT: andl $64, %edx
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrl $21, %ecx
; AVX-NEXT: andl $32, %ecx
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $23, %edx
; AVX-NEXT: andl $16, %edx
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrl $25, %ecx
; AVX-NEXT: andl $8, %ecx
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: movl %edi, %edx
; AVX-NEXT: shrl $27, %edx
; AVX-NEXT: andl $4, %edx
; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edi, %ecx
; AVX-NEXT: shrl $29, %ecx
; AVX-NEXT: andl $2, %ecx
; AVX-NEXT: orl %edx, %ecx
; AVX-NEXT: shrl $31, %edi
; AVX-NEXT: orl %ecx, %edi
; AVX-NEXT: orl %edi, %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP: # BB#0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE: # BB#0:
; SSE-NEXT: leaq (%rdi,%rdi), %rax
; SSE-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
; SSE-NEXT: andq %rax, %rcx
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: shlq $63, %rax
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $2, %rdx
; SSE-NEXT: shlq $61, %rdx
; SSE-NEXT: leaq (%rdx,%rax), %rax
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $4, %rdx
; SSE-NEXT: shlq $59, %rdx
; SSE-NEXT: orq %rdx, %rax
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $8, %rdx
; SSE-NEXT: shlq $57, %rdx
; SSE-NEXT: orq %rdx, %rax
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $16, %rdx
; SSE-NEXT: shlq $55, %rdx
; SSE-NEXT: orq %rdx, %rax
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $32, %rdx
; SSE-NEXT: shlq $53, %rdx
; SSE-NEXT: orq %rdx, %rax
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $64, %rdx
; SSE-NEXT: shlq $51, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $128, %rsi
; SSE-NEXT: shlq $49, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $256, %rdx # imm = 0x100
; SSE-NEXT: shlq $47, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $512, %rsi # imm = 0x200
; SSE-NEXT: shlq $45, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $1024, %rdx # imm = 0x400
; SSE-NEXT: shlq $43, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $2048, %rsi # imm = 0x800
; SSE-NEXT: shlq $41, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $4096, %rdx # imm = 0x1000
; SSE-NEXT: shlq $39, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $8192, %rsi # imm = 0x2000
; SSE-NEXT: shlq $37, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $16384, %rdx # imm = 0x4000
; SSE-NEXT: shlq $35, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $32768, %rsi # imm = 0x8000
; SSE-NEXT: shlq $33, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $65536, %rdx # imm = 0x10000
; SSE-NEXT: shlq $31, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $131072, %rsi # imm = 0x20000
; SSE-NEXT: shlq $29, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $262144, %rdx # imm = 0x40000
; SSE-NEXT: shlq $27, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $524288, %rsi # imm = 0x80000
; SSE-NEXT: shlq $25, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $1048576, %rdx # imm = 0x100000
; SSE-NEXT: shlq $23, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $2097152, %rsi # imm = 0x200000
; SSE-NEXT: shlq $21, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $4194304, %rdx # imm = 0x400000
; SSE-NEXT: shlq $19, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $8388608, %rsi # imm = 0x800000
; SSE-NEXT: shlq $17, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $16777216, %rdx # imm = 0x1000000
; SSE-NEXT: shlq $15, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $33554432, %rsi # imm = 0x2000000
; SSE-NEXT: shlq $13, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $67108864, %rdx # imm = 0x4000000
; SSE-NEXT: shlq $11, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $134217728, %rsi # imm = 0x8000000
; SSE-NEXT: shlq $9, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $268435456, %rdx # imm = 0x10000000
; SSE-NEXT: shlq $7, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: movq %rdi, %rsi
; SSE-NEXT: andq $536870912, %rsi # imm = 0x20000000
; SSE-NEXT: shlq $5, %rsi
; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: andq $1073741824, %rdx # imm = 0x40000000
; SSE-NEXT: shlq $3, %rdx
; SSE-NEXT: orq %rsi, %rdx
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $3, %rdx
; SSE-NEXT: andl $1073741824, %edx # imm = 0x40000000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $5, %rcx
; SSE-NEXT: andl $536870912, %ecx # imm = 0x20000000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $7, %rdx
; SSE-NEXT: andl $268435456, %edx # imm = 0x10000000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $9, %rcx
; SSE-NEXT: andl $134217728, %ecx # imm = 0x8000000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $11, %rdx
; SSE-NEXT: andl $67108864, %edx # imm = 0x4000000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $13, %rcx
; SSE-NEXT: andl $33554432, %ecx # imm = 0x2000000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $15, %rdx
; SSE-NEXT: andl $16777216, %edx # imm = 0x1000000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $17, %rcx
; SSE-NEXT: andl $8388608, %ecx # imm = 0x800000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $19, %rdx
; SSE-NEXT: andl $4194304, %edx # imm = 0x400000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $21, %rcx
; SSE-NEXT: andl $2097152, %ecx # imm = 0x200000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $23, %rdx
; SSE-NEXT: andl $1048576, %edx # imm = 0x100000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $25, %rcx
; SSE-NEXT: andl $524288, %ecx # imm = 0x80000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $27, %rdx
; SSE-NEXT: andl $262144, %edx # imm = 0x40000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $29, %rcx
; SSE-NEXT: andl $131072, %ecx # imm = 0x20000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $31, %rdx
; SSE-NEXT: andl $65536, %edx # imm = 0x10000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $33, %rcx
; SSE-NEXT: andl $32768, %ecx # imm = 0x8000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $35, %rdx
; SSE-NEXT: andl $16384, %edx # imm = 0x4000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $37, %rcx
; SSE-NEXT: andl $8192, %ecx # imm = 0x2000
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $39, %rdx
; SSE-NEXT: andl $4096, %edx # imm = 0x1000
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $41, %rcx
; SSE-NEXT: andl $2048, %ecx # imm = 0x800
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $43, %rdx
; SSE-NEXT: andl $1024, %edx # imm = 0x400
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $45, %rcx
; SSE-NEXT: andl $512, %ecx # imm = 0x200
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $47, %rdx
; SSE-NEXT: andl $256, %edx # imm = 0x100
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $49, %rcx
; SSE-NEXT: andl $128, %ecx
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $51, %rdx
; SSE-NEXT: andl $64, %edx
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $53, %rcx
; SSE-NEXT: andl $32, %ecx
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $55, %rdx
; SSE-NEXT: andl $16, %edx
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $57, %rcx
; SSE-NEXT: andl $8, %ecx
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: movq %rdi, %rdx
; SSE-NEXT: shrq $59, %rdx
; SSE-NEXT: andl $4, %edx
; SSE-NEXT: orq %rcx, %rdx
; SSE-NEXT: movq %rdi, %rcx
; SSE-NEXT: shrq $61, %rcx
; SSE-NEXT: andl $2, %ecx
; SSE-NEXT: orq %rdx, %rcx
; SSE-NEXT: shrq $63, %rdi
; SSE-NEXT: orq %rcx, %rdi
; SSE-NEXT: orq %rdi, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX: # BB#0:
; AVX-NEXT: leaq (%rdi,%rdi), %rax
; AVX-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
; AVX-NEXT: andq %rax, %rcx
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: shlq $63, %rax
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $2, %rdx
; AVX-NEXT: shlq $61, %rdx
; AVX-NEXT: leaq (%rdx,%rax), %rax
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $4, %rdx
; AVX-NEXT: shlq $59, %rdx
; AVX-NEXT: orq %rdx, %rax
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $8, %rdx
; AVX-NEXT: shlq $57, %rdx
; AVX-NEXT: orq %rdx, %rax
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $16, %rdx
; AVX-NEXT: shlq $55, %rdx
; AVX-NEXT: orq %rdx, %rax
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $32, %rdx
; AVX-NEXT: shlq $53, %rdx
; AVX-NEXT: orq %rdx, %rax
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $64, %rdx
; AVX-NEXT: shlq $51, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $128, %rsi
; AVX-NEXT: shlq $49, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $256, %rdx # imm = 0x100
; AVX-NEXT: shlq $47, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $512, %rsi # imm = 0x200
; AVX-NEXT: shlq $45, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $1024, %rdx # imm = 0x400
; AVX-NEXT: shlq $43, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $2048, %rsi # imm = 0x800
; AVX-NEXT: shlq $41, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $4096, %rdx # imm = 0x1000
; AVX-NEXT: shlq $39, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $8192, %rsi # imm = 0x2000
; AVX-NEXT: shlq $37, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $16384, %rdx # imm = 0x4000
; AVX-NEXT: shlq $35, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $32768, %rsi # imm = 0x8000
; AVX-NEXT: shlq $33, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $65536, %rdx # imm = 0x10000
; AVX-NEXT: shlq $31, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $131072, %rsi # imm = 0x20000
; AVX-NEXT: shlq $29, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $262144, %rdx # imm = 0x40000
; AVX-NEXT: shlq $27, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $524288, %rsi # imm = 0x80000
; AVX-NEXT: shlq $25, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $1048576, %rdx # imm = 0x100000
; AVX-NEXT: shlq $23, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $2097152, %rsi # imm = 0x200000
; AVX-NEXT: shlq $21, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $4194304, %rdx # imm = 0x400000
; AVX-NEXT: shlq $19, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $8388608, %rsi # imm = 0x800000
; AVX-NEXT: shlq $17, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $16777216, %rdx # imm = 0x1000000
; AVX-NEXT: shlq $15, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $33554432, %rsi # imm = 0x2000000
; AVX-NEXT: shlq $13, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $67108864, %rdx # imm = 0x4000000
; AVX-NEXT: shlq $11, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $134217728, %rsi # imm = 0x8000000
; AVX-NEXT: shlq $9, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $268435456, %rdx # imm = 0x10000000
; AVX-NEXT: shlq $7, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: movq %rdi, %rsi
; AVX-NEXT: andq $536870912, %rsi # imm = 0x20000000
; AVX-NEXT: shlq $5, %rsi
; AVX-NEXT: orq %rdx, %rsi
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: andq $1073741824, %rdx # imm = 0x40000000
; AVX-NEXT: shlq $3, %rdx
; AVX-NEXT: orq %rsi, %rdx
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq %rcx
; AVX-NEXT: andl $-2147483648, %ecx # imm = 0x80000000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $3, %rdx
; AVX-NEXT: andl $1073741824, %edx # imm = 0x40000000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $5, %rcx
; AVX-NEXT: andl $536870912, %ecx # imm = 0x20000000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $7, %rdx
; AVX-NEXT: andl $268435456, %edx # imm = 0x10000000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $9, %rcx
; AVX-NEXT: andl $134217728, %ecx # imm = 0x8000000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $11, %rdx
; AVX-NEXT: andl $67108864, %edx # imm = 0x4000000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $13, %rcx
; AVX-NEXT: andl $33554432, %ecx # imm = 0x2000000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $15, %rdx
; AVX-NEXT: andl $16777216, %edx # imm = 0x1000000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $17, %rcx
; AVX-NEXT: andl $8388608, %ecx # imm = 0x800000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $19, %rdx
; AVX-NEXT: andl $4194304, %edx # imm = 0x400000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $21, %rcx
; AVX-NEXT: andl $2097152, %ecx # imm = 0x200000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $23, %rdx
; AVX-NEXT: andl $1048576, %edx # imm = 0x100000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $25, %rcx
; AVX-NEXT: andl $524288, %ecx # imm = 0x80000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $27, %rdx
; AVX-NEXT: andl $262144, %edx # imm = 0x40000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $29, %rcx
; AVX-NEXT: andl $131072, %ecx # imm = 0x20000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $31, %rdx
; AVX-NEXT: andl $65536, %edx # imm = 0x10000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $33, %rcx
; AVX-NEXT: andl $32768, %ecx # imm = 0x8000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $35, %rdx
; AVX-NEXT: andl $16384, %edx # imm = 0x4000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $37, %rcx
; AVX-NEXT: andl $8192, %ecx # imm = 0x2000
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $39, %rdx
; AVX-NEXT: andl $4096, %edx # imm = 0x1000
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $41, %rcx
; AVX-NEXT: andl $2048, %ecx # imm = 0x800
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $43, %rdx
; AVX-NEXT: andl $1024, %edx # imm = 0x400
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $45, %rcx
; AVX-NEXT: andl $512, %ecx # imm = 0x200
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $47, %rdx
; AVX-NEXT: andl $256, %edx # imm = 0x100
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $49, %rcx
; AVX-NEXT: andl $128, %ecx
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $51, %rdx
; AVX-NEXT: andl $64, %edx
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $53, %rcx
; AVX-NEXT: andl $32, %ecx
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $55, %rdx
; AVX-NEXT: andl $16, %edx
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $57, %rcx
; AVX-NEXT: andl $8, %ecx
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: movq %rdi, %rdx
; AVX-NEXT: shrq $59, %rdx
; AVX-NEXT: andl $4, %edx
; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdi, %rcx
; AVX-NEXT: shrq $61, %rcx
; AVX-NEXT: andl $2, %ecx
; AVX-NEXT: orq %rdx, %rcx
; AVX-NEXT: shrq $63, %rdi
; AVX-NEXT: orq %rcx, %rdi
; AVX-NEXT: orq %rdi, %rax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP: # BB#0:
; XOP-NEXT: vmovq %rdi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovq %xmm0, %rax
; XOP-NEXT: retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrlw $7, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: pand %xmm1, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllw $7, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT: pand %xmm3, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw $5, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psllw $3, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: paddb %xmm3, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: pand {{.*}}(%rip), %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrlw $3, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: psrlw $5, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v16i8:
; AVX: # BB#0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP: # BB#0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT: pand %xmm2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $5, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psllw $3, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlw $5, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: pand %xmm0, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP: # BB#0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT: pand %xmm2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $5, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psllw $3, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlw $5, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: pand %xmm0, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v4i32:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP: # BB#0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT: pand %xmm2, %xmm2
; SSE2-NEXT: pand %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psllw $5, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psllw $3, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrlw $3, %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrlw $5, %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: pand %xmm0, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3: # BB#0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: pshufb %xmm2, %xmm3
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: pshufb %xmm0, %xmm1
; SSSE3-NEXT: por %xmm3, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP: # BB#0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm9
; SSE2-NEXT: pand %xmm9, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psllw $7, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT: pand %xmm10, %xmm10
; SSE2-NEXT: pand %xmm10, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw $3, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm11
; SSE2-NEXT: pand %xmm11, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: paddb %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm12
; SSE2-NEXT: pand %xmm12, %xmm3
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrlw $3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm6
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm7
; SSE2-NEXT: psrlw $5, %xmm7
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
; SSE2-NEXT: pand %xmm2, %xmm7
; SSE2-NEXT: por %xmm4, %xmm7
; SSE2-NEXT: psrlw $7, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-NEXT: pand %xmm3, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psllw $5, %xmm4
; SSE2-NEXT: pand %xmm9, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psllw $7, %xmm5
; SSE2-NEXT: pand %xmm10, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psllw $3, %xmm7
; SSE2-NEXT: pand %xmm11, %xmm7
; SSE2-NEXT: por %xmm4, %xmm7
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: paddb %xmm4, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: por %xmm7, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: psrlw $1, %xmm7
; SSE2-NEXT: pand %xmm12, %xmm7
; SSE2-NEXT: por %xmm4, %xmm7
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlw $3, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: por %xmm7, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: psrlw $5, %xmm6
; SSE2-NEXT: pand %xmm2, %xmm6
; SSE2-NEXT: por %xmm4, %xmm6
; SSE2-NEXT: psrlw $7, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: por %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pshufb %xmm2, %xmm6
; SSSE3-NEXT: psrlw $4, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pshufb %xmm0, %xmm3
; SSSE3-NEXT: por %xmm6, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm0, %xmm5
; SSSE3-NEXT: psrlw $4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: por %xmm5, %xmm2
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512: # BB#0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1: # BB#0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psllw $5, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; SSE2-NEXT: pand {{.*}}(%rip), %xmm10
; SSE2-NEXT: pand %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllw $7, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE2-NEXT: pand %xmm11, %xmm11
%xmm11, %xmm3 1571 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1572 ; SSE2-NEXT: psllw $3, %xmm4 1573 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 1574 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 1575 ; SSE2-NEXT: pand %xmm12, %xmm4 1576 ; SSE2-NEXT: por %xmm2, %xmm4 1577 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1578 ; SSE2-NEXT: paddb %xmm2, %xmm2 1579 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1580 ; SSE2-NEXT: pand %xmm8, %xmm2 1581 ; SSE2-NEXT: por %xmm4, %xmm2 1582 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1583 ; SSE2-NEXT: psrlw $1, %xmm4 1584 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1585 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 1586 ; SSE2-NEXT: pand %xmm13, %xmm4 1587 ; SSE2-NEXT: por %xmm2, %xmm4 1588 ; SSE2-NEXT: movdqa %xmm0, %xmm5 1589 ; SSE2-NEXT: psrlw $3, %xmm5 1590 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 1591 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm6 1592 ; SSE2-NEXT: pand %xmm6, %xmm5 1593 ; SSE2-NEXT: por %xmm4, %xmm5 1594 ; SSE2-NEXT: movdqa %xmm0, %xmm7 1595 ; SSE2-NEXT: psrlw $5, %xmm7 1596 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 1597 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1598 ; SSE2-NEXT: pand %xmm2, %xmm7 1599 ; SSE2-NEXT: por %xmm5, %xmm7 1600 ; SSE2-NEXT: psrlw $7, %xmm0 1601 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1602 ; SSE2-NEXT: pand %xmm4, %xmm4 1603 ; SSE2-NEXT: pand %xmm4, %xmm0 1604 ; SSE2-NEXT: por %xmm7, %xmm0 1605 ; SSE2-NEXT: por %xmm3, %xmm0 1606 ; SSE2-NEXT: movdqa %xmm1, %xmm3 1607 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 1608 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] 1609 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6] 1610 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1611 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 1612 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 1613 ; SSE2-NEXT: packuswb %xmm3, %xmm1 1614 ; SSE2-NEXT: movdqa %xmm1, %xmm5 1615 ; SSE2-NEXT: psllw $5, %xmm5 1616 ; SSE2-NEXT: pand %xmm10, %xmm5 1617 ; SSE2-NEXT: movdqa %xmm1, %xmm3 1618 ; SSE2-NEXT: psllw $7, %xmm3 1619 ; SSE2-NEXT: pand %xmm11, %xmm3 1620 ; SSE2-NEXT: movdqa %xmm1, %xmm7 1621 ; SSE2-NEXT: psllw $3, %xmm7 1622 ; SSE2-NEXT: pand %xmm12, %xmm7 1623 ; SSE2-NEXT: por %xmm5, %xmm7 1624 ; SSE2-NEXT: movdqa %xmm1, %xmm5 1625 ; SSE2-NEXT: paddb %xmm5, %xmm5 1626 ; SSE2-NEXT: pand %xmm8, %xmm5 1627 ; SSE2-NEXT: por %xmm7, %xmm5 1628 ; SSE2-NEXT: movdqa %xmm1, %xmm7 1629 ; SSE2-NEXT: psrlw $1, %xmm7 1630 ; SSE2-NEXT: pand %xmm13, %xmm7 1631 ; SSE2-NEXT: por %xmm5, %xmm7 1632 ; SSE2-NEXT: movdqa %xmm1, %xmm5 1633 ; SSE2-NEXT: psrlw $3, %xmm5 1634 ; SSE2-NEXT: pand %xmm6, %xmm5 1635 ; SSE2-NEXT: por %xmm7, %xmm5 1636 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1637 ; SSE2-NEXT: psrlw $5, %xmm6 1638 ; SSE2-NEXT: pand %xmm2, %xmm6 1639 ; SSE2-NEXT: por %xmm5, %xmm6 1640 ; SSE2-NEXT: psrlw $7, %xmm1 1641 ; SSE2-NEXT: pand %xmm4, %xmm1 1642 ; SSE2-NEXT: por %xmm6, %xmm1 1643 ; SSE2-NEXT: por %xmm3, %xmm1 1644 ; SSE2-NEXT: retq 1645 ; 1646 ; SSSE3-LABEL: test_bitreverse_v16i16: 1647 ; SSSE3: # BB#0: 1648 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1649 ; SSSE3-NEXT: pshufb %xmm4, %xmm0 
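; The SSSE3 expansion above byte-swaps each i16 lane with pshufb ([1,0,3,2,...,15,14]);
; the lines below then bit-reverse every byte by splitting it into nibbles and looking
; each half up in a table: [0,128,64,192,...,240] reverses the low nibble into the high
; bits, [0,8,4,12,...,15] reverses the high nibble into the low bits, and por merges them.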
1650 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1651 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 1652 ; SSSE3-NEXT: pand %xmm5, %xmm2 1653 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1654 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 1655 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 1656 ; SSSE3-NEXT: psrlw $4, %xmm0 1657 ; SSSE3-NEXT: pand %xmm5, %xmm0 1658 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1659 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 1660 ; SSSE3-NEXT: pshufb %xmm0, %xmm3 1661 ; SSSE3-NEXT: por %xmm7, %xmm3 1662 ; SSSE3-NEXT: pshufb %xmm4, %xmm1 1663 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 1664 ; SSSE3-NEXT: pand %xmm5, %xmm0 1665 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 1666 ; SSSE3-NEXT: psrlw $4, %xmm1 1667 ; SSSE3-NEXT: pand %xmm5, %xmm1 1668 ; SSSE3-NEXT: pshufb %xmm1, %xmm2 1669 ; SSSE3-NEXT: por %xmm6, %xmm2 1670 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 1671 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 1672 ; SSSE3-NEXT: retq 1673 ; 1674 ; AVX1-LABEL: test_bitreverse_v16i16: 1675 ; AVX1: # BB#0: 1676 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1677 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 1678 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1679 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1680 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1681 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1682 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1683 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1684 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1685 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1686 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1687 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1688 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1689 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1690 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1691 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1692 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1693 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1694 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1695 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1696 ; AVX1-NEXT: retq 1697 ; 1698 ; AVX2-LABEL: test_bitreverse_v16i16: 1699 ; AVX2: # BB#0: 1700 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1701 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1702 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1703 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1704 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1705 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1706 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1707 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1708 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1709 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1710 ; AVX2-NEXT: retq 1711 ; 1712 ; AVX512-LABEL: test_bitreverse_v16i16: 1713 ; AVX512: # BB#0: 1714 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] 1715 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1716 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1717 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1718 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1719 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1720 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1721 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1722 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1723 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1724 ; AVX512-NEXT: retq 1725 ; 1726 ; XOPAVX1-LABEL: test_bitreverse_v16i16: 1727 ; XOPAVX1: # BB#0: 1728 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1729 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1730 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1731 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1732 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1733 ; XOPAVX1-NEXT: retq 1734 ; 1735 ; XOPAVX2-LABEL: test_bitreverse_v16i16: 1736 ; XOPAVX2: # BB#0: 1737 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1738 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 1739 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1740 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1741 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1742 ; XOPAVX2-NEXT: retq 1743 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) 1744 ret <16 x i16> %b 1745 } 1746 1747 define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { 1748 ; SSE2-LABEL: test_bitreverse_v8i32: 1749 ; SSE2: # BB#0: 1750 ; SSE2-NEXT: pxor %xmm9, %xmm9 1751 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1752 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 1753 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1754 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1755 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1756 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1757 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1758 ; SSE2-NEXT: packuswb %xmm2, %xmm0 1759 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1760 ; SSE2-NEXT: psllw $5, %xmm2 1761 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1762 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 1763 ; SSE2-NEXT: pand %xmm10, %xmm2 1764 ; SSE2-NEXT: movdqa %xmm0, %xmm3 1765 ; SSE2-NEXT: psllw $7, %xmm3 1766 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1767 ; SSE2-NEXT: pand %xmm11, %xmm11 1768 ; SSE2-NEXT: pand %xmm11, %xmm3 1769 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1770 ; SSE2-NEXT: psllw $3, %xmm4 1771 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 1772 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 1773 ; SSE2-NEXT: pand %xmm12, %xmm4 1774 ; SSE2-NEXT: por %xmm2, %xmm4 1775 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1776 ; SSE2-NEXT: paddb %xmm2, %xmm2 1777 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1778 ; SSE2-NEXT: pand %xmm8, %xmm2 1779 ; SSE2-NEXT: por %xmm4, %xmm2 1780 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1781 ; SSE2-NEXT: psrlw $1, %xmm4 1782 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1783 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 1784 ; SSE2-NEXT: pand %xmm13, %xmm4 1785 ; SSE2-NEXT: por 
%xmm2, %xmm4 1786 ; SSE2-NEXT: movdqa %xmm0, %xmm5 1787 ; SSE2-NEXT: psrlw $3, %xmm5 1788 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 1789 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm6 1790 ; SSE2-NEXT: pand %xmm6, %xmm5 1791 ; SSE2-NEXT: por %xmm4, %xmm5 1792 ; SSE2-NEXT: movdqa %xmm0, %xmm7 1793 ; SSE2-NEXT: psrlw $5, %xmm7 1794 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 1795 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1796 ; SSE2-NEXT: pand %xmm2, %xmm7 1797 ; SSE2-NEXT: por %xmm5, %xmm7 1798 ; SSE2-NEXT: psrlw $7, %xmm0 1799 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1800 ; SSE2-NEXT: pand %xmm4, %xmm4 1801 ; SSE2-NEXT: pand %xmm4, %xmm0 1802 ; SSE2-NEXT: por %xmm7, %xmm0 1803 ; SSE2-NEXT: por %xmm3, %xmm0 1804 ; SSE2-NEXT: movdqa %xmm1, %xmm3 1805 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] 1806 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 1807 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 1808 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1809 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 1810 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 1811 ; SSE2-NEXT: packuswb %xmm3, %xmm1 1812 ; SSE2-NEXT: movdqa %xmm1, %xmm5 1813 ; SSE2-NEXT: psllw $5, %xmm5 1814 ; SSE2-NEXT: pand %xmm10, %xmm5 1815 ; SSE2-NEXT: movdqa %xmm1, %xmm3 1816 ; SSE2-NEXT: psllw $7, %xmm3 1817 ; SSE2-NEXT: pand %xmm11, %xmm3 1818 ; SSE2-NEXT: movdqa %xmm1, %xmm7 1819 ; SSE2-NEXT: psllw $3, %xmm7 1820 ; SSE2-NEXT: pand %xmm12, %xmm7 1821 ; SSE2-NEXT: por %xmm5, %xmm7 1822 ; SSE2-NEXT: movdqa %xmm1, %xmm5 1823 ; SSE2-NEXT: paddb %xmm5, %xmm5 1824 ; SSE2-NEXT: pand %xmm8, %xmm5 1825 ; SSE2-NEXT: por %xmm7, %xmm5 1826 ; SSE2-NEXT: movdqa %xmm1, %xmm7 1827 ; SSE2-NEXT: psrlw $1, %xmm7 1828 ; SSE2-NEXT: pand %xmm13, %xmm7 1829 ; SSE2-NEXT: por %xmm5, %xmm7 1830 ; SSE2-NEXT: movdqa %xmm1, %xmm5 1831 ; SSE2-NEXT: psrlw $3, %xmm5 1832 ; SSE2-NEXT: pand %xmm6, %xmm5 1833 ; SSE2-NEXT: por %xmm7, %xmm5 1834 ; SSE2-NEXT: movdqa %xmm1, %xmm6 1835 ; SSE2-NEXT: psrlw $5, %xmm6 1836 ; SSE2-NEXT: pand %xmm2, %xmm6 1837 ; SSE2-NEXT: por %xmm5, %xmm6 1838 ; SSE2-NEXT: psrlw $7, %xmm1 1839 ; SSE2-NEXT: pand %xmm4, %xmm1 1840 ; SSE2-NEXT: por %xmm6, %xmm1 1841 ; SSE2-NEXT: por %xmm3, %xmm1 1842 ; SSE2-NEXT: retq 1843 ; 1844 ; SSSE3-LABEL: test_bitreverse_v8i32: 1845 ; SSSE3: # BB#0: 1846 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1847 ; SSSE3-NEXT: pshufb %xmm4, %xmm0 1848 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1849 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 1850 ; SSSE3-NEXT: pand %xmm5, %xmm2 1851 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1852 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 1853 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 1854 ; SSSE3-NEXT: psrlw $4, %xmm0 1855 ; SSSE3-NEXT: pand %xmm5, %xmm0 1856 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1857 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 1858 ; SSSE3-NEXT: pshufb %xmm0, %xmm3 1859 ; SSSE3-NEXT: por %xmm7, %xmm3 1860 ; SSSE3-NEXT: pshufb %xmm4, %xmm1 1861 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 1862 ; SSSE3-NEXT: pand %xmm5, %xmm0 1863 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 1864 ; 
SSSE3-NEXT: psrlw $4, %xmm1 1865 ; SSSE3-NEXT: pand %xmm5, %xmm1 1866 ; SSSE3-NEXT: pshufb %xmm1, %xmm2 1867 ; SSSE3-NEXT: por %xmm6, %xmm2 1868 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 1869 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 1870 ; SSSE3-NEXT: retq 1871 ; 1872 ; AVX1-LABEL: test_bitreverse_v8i32: 1873 ; AVX1: # BB#0: 1874 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1875 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 1876 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 1877 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1878 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 1879 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1880 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 1881 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 1882 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1883 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1884 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 1885 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 1886 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 1887 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 1888 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 1889 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 1890 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1891 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 1892 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1893 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1894 ; AVX1-NEXT: retq 1895 ; 1896 ; AVX2-LABEL: test_bitreverse_v8i32: 1897 ; AVX2: # BB#0: 1898 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1899 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1900 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 1901 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1902 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1903 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 1904 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 1905 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1906 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1907 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 1908 ; AVX2-NEXT: retq 1909 ; 1910 ; AVX512-LABEL: test_bitreverse_v8i32: 1911 ; AVX512: # BB#0: 1912 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] 1913 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 1914 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 1915 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 1916 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 1917 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 1918 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 1919 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 1920 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 1921 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 1922 ; AVX512-NEXT: retq 1923 ; 1924 ; XOPAVX1-LABEL: test_bitreverse_v8i32: 1925 ; XOPAVX1: # BB#0: 1926 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1927 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1928 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1929 ; XOPAVX1-NEXT: 
vpperm %xmm2, %xmm0, %xmm0, %xmm0 1930 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1931 ; XOPAVX1-NEXT: retq 1932 ; 1933 ; XOPAVX2-LABEL: test_bitreverse_v8i32: 1934 ; XOPAVX2: # BB#0: 1935 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1936 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 1937 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 1938 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 1939 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1940 ; XOPAVX2-NEXT: retq 1941 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) 1942 ret <8 x i32> %b 1943 } 1944 1945 define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { 1946 ; SSE2-LABEL: test_bitreverse_v4i64: 1947 ; SSE2: # BB#0: 1948 ; SSE2-NEXT: pxor %xmm9, %xmm9 1949 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1950 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] 1951 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 1952 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 1953 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 1954 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1955 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1956 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 1957 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 1958 ; SSE2-NEXT: packuswb %xmm2, %xmm0 1959 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1960 ; SSE2-NEXT: psllw $5, %xmm2 1961 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1962 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 1963 ; SSE2-NEXT: pand %xmm10, %xmm2 1964 ; SSE2-NEXT: movdqa %xmm0, %xmm4 1965 ; SSE2-NEXT: psllw $7, %xmm4 1966 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 1967 ; SSE2-NEXT: pand %xmm11, %xmm11 1968 ; SSE2-NEXT: pand %xmm11, %xmm4 1969 ; SSE2-NEXT: movdqa %xmm0, %xmm3 1970 ; SSE2-NEXT: psllw $3, %xmm3 1971 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 1972 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 1973 ; SSE2-NEXT: pand %xmm12, %xmm3 1974 ; SSE2-NEXT: por %xmm2, %xmm3 1975 ; SSE2-NEXT: movdqa %xmm0, %xmm2 1976 ; SSE2-NEXT: paddb %xmm2, %xmm2 1977 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 1978 ; SSE2-NEXT: pand %xmm8, %xmm2 1979 ; SSE2-NEXT: por %xmm3, %xmm2 1980 ; SSE2-NEXT: movdqa %xmm0, %xmm3 1981 ; SSE2-NEXT: psrlw $1, %xmm3 1982 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1983 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 1984 ; SSE2-NEXT: pand %xmm13, %xmm3 1985 ; SSE2-NEXT: por %xmm2, %xmm3 1986 ; SSE2-NEXT: movdqa %xmm0, %xmm5 1987 ; SSE2-NEXT: psrlw $3, %xmm5 1988 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 1989 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm6 1990 ; SSE2-NEXT: pand %xmm6, %xmm5 1991 ; SSE2-NEXT: por %xmm3, %xmm5 1992 ; SSE2-NEXT: movdqa %xmm0, %xmm7 1993 ; SSE2-NEXT: psrlw $5, %xmm7 1994 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 1995 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 1996 ; SSE2-NEXT: pand %xmm2, %xmm7 1997 ; SSE2-NEXT: por %xmm5, %xmm7 1998 ; SSE2-NEXT: psrlw $7, %xmm0 1999 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2000 ; SSE2-NEXT: pand %xmm3, %xmm3 2001 ; 
SSE2-NEXT: pand %xmm3, %xmm0 2002 ; SSE2-NEXT: por %xmm7, %xmm0 2003 ; SSE2-NEXT: por %xmm4, %xmm0 2004 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2005 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2006 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 2007 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2008 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2009 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 2010 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 2011 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2012 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2013 ; SSE2-NEXT: packuswb %xmm4, %xmm1 2014 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2015 ; SSE2-NEXT: psllw $5, %xmm5 2016 ; SSE2-NEXT: pand %xmm10, %xmm5 2017 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2018 ; SSE2-NEXT: psllw $7, %xmm4 2019 ; SSE2-NEXT: pand %xmm11, %xmm4 2020 ; SSE2-NEXT: movdqa %xmm1, %xmm7 2021 ; SSE2-NEXT: psllw $3, %xmm7 2022 ; SSE2-NEXT: pand %xmm12, %xmm7 2023 ; SSE2-NEXT: por %xmm5, %xmm7 2024 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2025 ; SSE2-NEXT: paddb %xmm5, %xmm5 2026 ; SSE2-NEXT: pand %xmm8, %xmm5 2027 ; SSE2-NEXT: por %xmm7, %xmm5 2028 ; SSE2-NEXT: movdqa %xmm1, %xmm7 2029 ; SSE2-NEXT: psrlw $1, %xmm7 2030 ; SSE2-NEXT: pand %xmm13, %xmm7 2031 ; SSE2-NEXT: por %xmm5, %xmm7 2032 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2033 ; SSE2-NEXT: psrlw $3, %xmm5 2034 ; SSE2-NEXT: pand %xmm6, %xmm5 2035 ; SSE2-NEXT: por %xmm7, %xmm5 2036 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2037 ; SSE2-NEXT: psrlw $5, %xmm6 2038 ; SSE2-NEXT: pand %xmm2, %xmm6 2039 ; SSE2-NEXT: por %xmm5, %xmm6 2040 ; SSE2-NEXT: psrlw $7, %xmm1 2041 ; SSE2-NEXT: pand %xmm3, %xmm1 2042 ; SSE2-NEXT: por %xmm6, %xmm1 2043 ; SSE2-NEXT: por %xmm4, %xmm1 2044 ; SSE2-NEXT: retq 2045 ; 2046 ; SSSE3-LABEL: test_bitreverse_v4i64: 2047 ; SSSE3: # BB#0: 2048 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2049 ; SSSE3-NEXT: pshufb %xmm4, %xmm0 2050 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2051 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 2052 ; SSSE3-NEXT: pand %xmm5, %xmm2 2053 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2054 ; SSSE3-NEXT: movdqa %xmm6, %xmm7 2055 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 2056 ; SSSE3-NEXT: psrlw $4, %xmm0 2057 ; SSSE3-NEXT: pand %xmm5, %xmm0 2058 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2059 ; SSSE3-NEXT: movdqa %xmm2, %xmm3 2060 ; SSSE3-NEXT: pshufb %xmm0, %xmm3 2061 ; SSSE3-NEXT: por %xmm7, %xmm3 2062 ; SSSE3-NEXT: pshufb %xmm4, %xmm1 2063 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 2064 ; SSSE3-NEXT: pand %xmm5, %xmm0 2065 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 2066 ; SSSE3-NEXT: psrlw $4, %xmm1 2067 ; SSSE3-NEXT: pand %xmm5, %xmm1 2068 ; SSSE3-NEXT: pshufb %xmm1, %xmm2 2069 ; SSSE3-NEXT: por %xmm6, %xmm2 2070 ; SSSE3-NEXT: movdqa %xmm3, %xmm0 2071 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 2072 ; SSSE3-NEXT: retq 2073 ; 2074 ; AVX1-LABEL: test_bitreverse_v4i64: 2075 ; AVX1: # BB#0: 2076 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2077 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 2078 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2079 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2080 ; 
AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 2081 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2082 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2083 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2084 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2085 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2086 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 2087 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 2088 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2089 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 2090 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 2091 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2092 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 2093 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 2094 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 2095 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2096 ; AVX1-NEXT: retq 2097 ; 2098 ; AVX2-LABEL: test_bitreverse_v4i64: 2099 ; AVX2: # BB#0: 2100 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 2101 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2102 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 2103 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2104 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 2105 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2106 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2107 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2108 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 2109 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 2110 ; AVX2-NEXT: retq 2111 ; 2112 ; AVX512-LABEL: test_bitreverse_v4i64: 2113 ; AVX512: # BB#0: 2114 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] 2115 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2116 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 2117 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2118 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 2119 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 2120 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2121 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2122 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 2123 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 2124 ; AVX512-NEXT: retq 2125 ; 2126 ; XOPAVX1-LABEL: test_bitreverse_v4i64: 2127 ; XOPAVX1: # BB#0: 2128 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2129 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2130 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 2131 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 2132 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 2133 ; XOPAVX1-NEXT: retq 2134 ; 2135 ; XOPAVX2-LABEL: test_bitreverse_v4i64: 2136 ; XOPAVX2: # BB#0: 2137 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2138 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 2139 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1 2140 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0 2141 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2142 ; XOPAVX2-NEXT: retq 2143 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x 
i64> %a) 2144 ret <4 x i64> %b 2145 } 2146 2147 define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { 2148 ; SSE2-LABEL: test_bitreverse_v64i8: 2149 ; SSE2: # BB#0: 2150 ; SSE2-NEXT: movdqa %xmm0, %xmm4 2151 ; SSE2-NEXT: psllw $5, %xmm4 2152 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 2153 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm9 2154 ; SSE2-NEXT: pand %xmm9, %xmm4 2155 ; SSE2-NEXT: movdqa %xmm0, %xmm7 2156 ; SSE2-NEXT: psllw $7, %xmm7 2157 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 2158 ; SSE2-NEXT: pand %xmm10, %xmm10 2159 ; SSE2-NEXT: pand %xmm10, %xmm7 2160 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2161 ; SSE2-NEXT: psllw $3, %xmm5 2162 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 2163 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm11 2164 ; SSE2-NEXT: pand %xmm11, %xmm5 2165 ; SSE2-NEXT: por %xmm4, %xmm5 2166 ; SSE2-NEXT: movdqa %xmm0, %xmm4 2167 ; SSE2-NEXT: paddb %xmm4, %xmm4 2168 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2169 ; SSE2-NEXT: pand %xmm8, %xmm4 2170 ; SSE2-NEXT: por %xmm5, %xmm4 2171 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2172 ; SSE2-NEXT: psrlw $1, %xmm5 2173 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 2174 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 2175 ; SSE2-NEXT: pand %xmm12, %xmm5 2176 ; SSE2-NEXT: por %xmm4, %xmm5 2177 ; SSE2-NEXT: movdqa %xmm0, %xmm6 2178 ; SSE2-NEXT: psrlw $3, %xmm6 2179 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 2180 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 2181 ; SSE2-NEXT: pand %xmm13, %xmm6 2182 ; SSE2-NEXT: por %xmm5, %xmm6 2183 ; SSE2-NEXT: movdqa %xmm0, %xmm4 2184 ; SSE2-NEXT: psrlw $5, %xmm4 2185 ; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2186 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm14 2187 ; SSE2-NEXT: pand %xmm14, %xmm4 2188 ; SSE2-NEXT: por %xmm6, %xmm4 2189 ; SSE2-NEXT: psrlw $7, %xmm0 2190 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2191 ; SSE2-NEXT: pand %xmm6, %xmm6 2192 ; SSE2-NEXT: pand %xmm6, %xmm0 2193 ; SSE2-NEXT: por %xmm4, %xmm0 2194 ; SSE2-NEXT: por %xmm7, %xmm0 2195 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2196 ; SSE2-NEXT: psllw $5, %xmm4 2197 ; SSE2-NEXT: pand %xmm9, %xmm4 2198 ; SSE2-NEXT: movdqa %xmm1, %xmm7 2199 ; SSE2-NEXT: psllw $7, %xmm7 2200 ; SSE2-NEXT: pand %xmm10, %xmm7 2201 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2202 ; SSE2-NEXT: psllw $3, %xmm5 2203 ; SSE2-NEXT: pand %xmm11, %xmm5 2204 ; SSE2-NEXT: por %xmm4, %xmm5 2205 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2206 ; SSE2-NEXT: paddb %xmm4, %xmm4 2207 ; SSE2-NEXT: pand %xmm8, %xmm4 2208 ; SSE2-NEXT: por %xmm5, %xmm4 2209 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2210 ; SSE2-NEXT: psrlw $1, %xmm5 2211 ; SSE2-NEXT: pand %xmm12, %xmm5 2212 ; SSE2-NEXT: por %xmm4, %xmm5 2213 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2214 ; SSE2-NEXT: psrlw $3, %xmm4 2215 ; SSE2-NEXT: pand %xmm13, %xmm4 2216 ; SSE2-NEXT: por %xmm5, %xmm4 2217 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2218 ; SSE2-NEXT: psrlw $5, %xmm5 2219 ; SSE2-NEXT: pand %xmm14, %xmm5 2220 ; SSE2-NEXT: por %xmm4, %xmm5 2221 ; SSE2-NEXT: psrlw $7, %xmm1 2222 ; SSE2-NEXT: pand %xmm6, %xmm1 2223 ; SSE2-NEXT: por %xmm5, %xmm1 2224 ; SSE2-NEXT: por %xmm7, %xmm1 2225 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2226 ; SSE2-NEXT: psllw $5, %xmm4 2227 ; SSE2-NEXT: pand %xmm9, %xmm4 2228 ; SSE2-NEXT: movdqa %xmm2, %xmm7 2229 ; SSE2-NEXT: psllw $7, %xmm7 2230 ; SSE2-NEXT: pand %xmm10, %xmm7 2231 ; SSE2-NEXT: movdqa 
%xmm2, %xmm5 2232 ; SSE2-NEXT: psllw $3, %xmm5 2233 ; SSE2-NEXT: pand %xmm11, %xmm5 2234 ; SSE2-NEXT: por %xmm4, %xmm5 2235 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2236 ; SSE2-NEXT: paddb %xmm4, %xmm4 2237 ; SSE2-NEXT: pand %xmm8, %xmm4 2238 ; SSE2-NEXT: por %xmm5, %xmm4 2239 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2240 ; SSE2-NEXT: psrlw $1, %xmm5 2241 ; SSE2-NEXT: pand %xmm12, %xmm5 2242 ; SSE2-NEXT: por %xmm4, %xmm5 2243 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2244 ; SSE2-NEXT: psrlw $3, %xmm4 2245 ; SSE2-NEXT: pand %xmm13, %xmm4 2246 ; SSE2-NEXT: por %xmm5, %xmm4 2247 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2248 ; SSE2-NEXT: psrlw $5, %xmm5 2249 ; SSE2-NEXT: pand %xmm14, %xmm5 2250 ; SSE2-NEXT: por %xmm4, %xmm5 2251 ; SSE2-NEXT: psrlw $7, %xmm2 2252 ; SSE2-NEXT: pand %xmm6, %xmm2 2253 ; SSE2-NEXT: por %xmm5, %xmm2 2254 ; SSE2-NEXT: por %xmm7, %xmm2 2255 ; SSE2-NEXT: movdqa %xmm3, %xmm4 2256 ; SSE2-NEXT: psllw $5, %xmm4 2257 ; SSE2-NEXT: pand %xmm9, %xmm4 2258 ; SSE2-NEXT: movdqa %xmm3, %xmm7 2259 ; SSE2-NEXT: psllw $7, %xmm7 2260 ; SSE2-NEXT: pand %xmm10, %xmm7 2261 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2262 ; SSE2-NEXT: psllw $3, %xmm5 2263 ; SSE2-NEXT: pand %xmm11, %xmm5 2264 ; SSE2-NEXT: por %xmm4, %xmm5 2265 ; SSE2-NEXT: movdqa %xmm3, %xmm4 2266 ; SSE2-NEXT: paddb %xmm4, %xmm4 2267 ; SSE2-NEXT: pand %xmm8, %xmm4 2268 ; SSE2-NEXT: por %xmm5, %xmm4 2269 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2270 ; SSE2-NEXT: psrlw $1, %xmm5 2271 ; SSE2-NEXT: pand %xmm12, %xmm5 2272 ; SSE2-NEXT: por %xmm4, %xmm5 2273 ; SSE2-NEXT: movdqa %xmm3, %xmm4 2274 ; SSE2-NEXT: psrlw $3, %xmm4 2275 ; SSE2-NEXT: pand %xmm13, %xmm4 2276 ; SSE2-NEXT: por %xmm5, %xmm4 2277 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2278 ; SSE2-NEXT: psrlw $5, %xmm5 2279 ; SSE2-NEXT: pand %xmm14, %xmm5 2280 ; SSE2-NEXT: por %xmm4, %xmm5 2281 ; SSE2-NEXT: psrlw $7, %xmm3 2282 ; SSE2-NEXT: pand %xmm6, %xmm3 2283 ; SSE2-NEXT: por %xmm5, %xmm3 2284 ; SSE2-NEXT: por %xmm7, %xmm3 2285 ; SSE2-NEXT: retq 2286 ; 2287 ; SSSE3-LABEL: test_bitreverse_v64i8: 2288 ; SSSE3: # BB#0: 2289 ; SSSE3-NEXT: movdqa %xmm0, %xmm5 2290 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2291 ; SSSE3-NEXT: pand %xmm8, %xmm0 2292 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2293 ; SSSE3-NEXT: movdqa %xmm9, %xmm6 2294 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 2295 ; SSSE3-NEXT: psrlw $4, %xmm5 2296 ; SSSE3-NEXT: pand %xmm8, %xmm5 2297 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2298 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 2299 ; SSSE3-NEXT: pshufb %xmm5, %xmm0 2300 ; SSSE3-NEXT: por %xmm6, %xmm0 2301 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 2302 ; SSSE3-NEXT: pand %xmm8, %xmm5 2303 ; SSSE3-NEXT: movdqa %xmm9, %xmm6 2304 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 2305 ; SSSE3-NEXT: psrlw $4, %xmm1 2306 ; SSSE3-NEXT: pand %xmm8, %xmm1 2307 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 2308 ; SSSE3-NEXT: pshufb %xmm1, %xmm5 2309 ; SSSE3-NEXT: por %xmm6, %xmm5 2310 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 2311 ; SSSE3-NEXT: pand %xmm8, %xmm1 2312 ; SSSE3-NEXT: movdqa %xmm9, %xmm7 2313 ; SSSE3-NEXT: pshufb %xmm1, %xmm7 2314 ; SSSE3-NEXT: psrlw $4, %xmm2 2315 ; SSSE3-NEXT: pand %xmm8, %xmm2 2316 ; SSSE3-NEXT: movdqa %xmm4, %xmm6 2317 ; SSSE3-NEXT: pshufb %xmm2, %xmm6 2318 ; SSSE3-NEXT: por %xmm7, %xmm6 2319 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 2320 ; SSSE3-NEXT: pand %xmm8, %xmm1 2321 ; SSSE3-NEXT: pshufb %xmm1, %xmm9 2322 ; SSSE3-NEXT: psrlw $4, %xmm3 2323 ; SSSE3-NEXT: pand %xmm8, %xmm3 2324 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 2325 ; SSSE3-NEXT: por 
%xmm9, %xmm4 2326 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 2327 ; SSSE3-NEXT: movdqa %xmm6, %xmm2 2328 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 2329 ; SSSE3-NEXT: retq 2330 ; 2331 ; AVX1-LABEL: test_bitreverse_v64i8: 2332 ; AVX1: # BB#0: 2333 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2334 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2335 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4 2336 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2337 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2338 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2339 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2340 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2341 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 2342 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 2343 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm4 2344 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2345 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2346 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 2347 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 2348 ; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 2349 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2350 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2351 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm4 2352 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2353 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2354 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2355 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 2356 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 2357 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm4 2358 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 2359 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2360 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2361 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 2362 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 2363 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2364 ; AVX1-NEXT: retq 2365 ; 2366 ; AVX2-LABEL: test_bitreverse_v64i8: 2367 ; AVX2: # BB#0: 2368 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2369 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 2370 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2371 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2372 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2373 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2374 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2375 ; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 2376 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 2377 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 2378 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2379 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2380 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2381 ; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 2382 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1 2383 ; AVX2-NEXT: retq 2384 ; 2385 ; AVX512F-LABEL: test_bitreverse_v64i8: 2386 ; AVX512F: # BB#0: 2387 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2388 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 2389 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2390 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2391 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2392 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 2393 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2394 ; AVX512F-NEXT: vpshufb %ymm0, 
%ymm5, %ymm0 2395 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 2396 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 2397 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 2398 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2399 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 2400 ; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 2401 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 2402 ; AVX512F-NEXT: retq 2403 ; 2404 ; AVX512BW-LABEL: test_bitreverse_v64i8: 2405 ; AVX512BW: # BB#0: 2406 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2407 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2408 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2409 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2410 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2411 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2412 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2413 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2414 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2415 ; AVX512BW-NEXT: retq 2416 ; 2417 ; XOPAVX1-LABEL: test_bitreverse_v64i8: 2418 ; XOPAVX1: # BB#0: 2419 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2420 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2421 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2422 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2423 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2424 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2425 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2426 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2427 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2428 ; XOPAVX1-NEXT: retq 2429 ; 2430 ; XOPAVX2-LABEL: test_bitreverse_v64i8: 2431 ; XOPAVX2: # BB#0: 2432 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2433 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95] 2434 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2435 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2436 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2437 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2438 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2439 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2440 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2441 ; XOPAVX2-NEXT: retq 2442 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) 2443 ret <64 x i8> %b 2444 } 2445 2446 define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { 2447 ; SSE2-LABEL: test_bitreverse_v32i16: 2448 ; SSE2: # BB#0: 2449 ; SSE2-NEXT: pxor %xmm9, %xmm9 2450 ; SSE2-NEXT: movdqa %xmm0, %xmm4 2451 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2452 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 2453 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 2454 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 2455 ; 
SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2456 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] 2457 ; SSE2-NEXT: packuswb %xmm4, %xmm0 2458 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2459 ; SSE2-NEXT: psllw $5, %xmm5 2460 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 2461 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 2462 ; SSE2-NEXT: pand %xmm10, %xmm5 2463 ; SSE2-NEXT: movdqa %xmm0, %xmm4 2464 ; SSE2-NEXT: psllw $7, %xmm4 2465 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 2466 ; SSE2-NEXT: pand %xmm11, %xmm11 2467 ; SSE2-NEXT: pand %xmm11, %xmm4 2468 ; SSE2-NEXT: movdqa %xmm0, %xmm6 2469 ; SSE2-NEXT: psllw $3, %xmm6 2470 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 2471 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 2472 ; SSE2-NEXT: pand %xmm12, %xmm6 2473 ; SSE2-NEXT: por %xmm5, %xmm6 2474 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2475 ; SSE2-NEXT: paddb %xmm5, %xmm5 2476 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2477 ; SSE2-NEXT: pand %xmm8, %xmm5 2478 ; SSE2-NEXT: por %xmm6, %xmm5 2479 ; SSE2-NEXT: movdqa %xmm0, %xmm6 2480 ; SSE2-NEXT: psrlw $1, %xmm6 2481 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 2482 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 2483 ; SSE2-NEXT: pand %xmm13, %xmm6 2484 ; SSE2-NEXT: por %xmm5, %xmm6 2485 ; SSE2-NEXT: movdqa %xmm0, %xmm7 2486 ; SSE2-NEXT: psrlw $3, %xmm7 2487 ; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 2488 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm14 2489 ; SSE2-NEXT: pand %xmm14, %xmm7 2490 ; SSE2-NEXT: por %xmm6, %xmm7 2491 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2492 ; SSE2-NEXT: psrlw $5, %xmm5 2493 ; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2494 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm15 2495 ; SSE2-NEXT: pand %xmm15, %xmm5 2496 ; SSE2-NEXT: por %xmm7, %xmm5 2497 ; SSE2-NEXT: psrlw $7, %xmm0 2498 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2499 ; SSE2-NEXT: pand %xmm7, %xmm7 2500 ; SSE2-NEXT: pand %xmm7, %xmm0 2501 ; SSE2-NEXT: por %xmm5, %xmm0 2502 ; SSE2-NEXT: por %xmm4, %xmm0 2503 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2504 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2505 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 2506 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 2507 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 2508 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] 2509 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6] 2510 ; SSE2-NEXT: packuswb %xmm4, %xmm1 2511 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2512 ; SSE2-NEXT: psllw $5, %xmm5 2513 ; SSE2-NEXT: pand %xmm10, %xmm5 2514 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2515 ; SSE2-NEXT: psllw $7, %xmm4 2516 ; SSE2-NEXT: pand %xmm11, %xmm4 2517 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2518 ; SSE2-NEXT: psllw $3, %xmm6 2519 ; SSE2-NEXT: pand %xmm12, %xmm6 2520 ; SSE2-NEXT: por %xmm5, %xmm6 2521 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2522 ; SSE2-NEXT: paddb %xmm5, %xmm5 2523 ; SSE2-NEXT: pand %xmm8, %xmm5 2524 ; SSE2-NEXT: por %xmm6, %xmm5 2525 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2526 ; SSE2-NEXT: psrlw $1, %xmm6 2527 ; SSE2-NEXT: pand %xmm13, %xmm6 2528 ; SSE2-NEXT: por 
%xmm5, %xmm6 2529 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2530 ; SSE2-NEXT: psrlw $3, %xmm5 2531 ; SSE2-NEXT: pand %xmm14, %xmm5 2532 ; SSE2-NEXT: por %xmm6, %xmm5 2533 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2534 ; SSE2-NEXT: psrlw $5, %xmm6 2535 ; SSE2-NEXT: pand %xmm15, %xmm6 2536 ; SSE2-NEXT: por %xmm5, %xmm6 2537 ; SSE2-NEXT: psrlw $7, %xmm1 2538 ; SSE2-NEXT: pand %xmm7, %xmm1 2539 ; SSE2-NEXT: por %xmm6, %xmm1 2540 ; SSE2-NEXT: por %xmm4, %xmm1 2541 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2542 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2543 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 2544 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 2545 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 2546 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] 2547 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6] 2548 ; SSE2-NEXT: packuswb %xmm4, %xmm2 2549 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2550 ; SSE2-NEXT: psllw $5, %xmm5 2551 ; SSE2-NEXT: pand %xmm10, %xmm5 2552 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2553 ; SSE2-NEXT: psllw $7, %xmm4 2554 ; SSE2-NEXT: pand %xmm11, %xmm4 2555 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2556 ; SSE2-NEXT: psllw $3, %xmm6 2557 ; SSE2-NEXT: pand %xmm12, %xmm6 2558 ; SSE2-NEXT: por %xmm5, %xmm6 2559 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2560 ; SSE2-NEXT: paddb %xmm5, %xmm5 2561 ; SSE2-NEXT: pand %xmm8, %xmm5 2562 ; SSE2-NEXT: por %xmm6, %xmm5 2563 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2564 ; SSE2-NEXT: psrlw $1, %xmm6 2565 ; SSE2-NEXT: pand %xmm13, %xmm6 2566 ; SSE2-NEXT: por %xmm5, %xmm6 2567 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2568 ; SSE2-NEXT: psrlw $3, %xmm5 2569 ; SSE2-NEXT: pand %xmm14, %xmm5 2570 ; SSE2-NEXT: por %xmm6, %xmm5 2571 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2572 ; SSE2-NEXT: psrlw $5, %xmm6 2573 ; SSE2-NEXT: pand %xmm15, %xmm6 2574 ; SSE2-NEXT: por %xmm5, %xmm6 2575 ; SSE2-NEXT: psrlw $7, %xmm2 2576 ; SSE2-NEXT: pand %xmm7, %xmm2 2577 ; SSE2-NEXT: por %xmm6, %xmm2 2578 ; SSE2-NEXT: por %xmm4, %xmm2 2579 ; SSE2-NEXT: movdqa %xmm3, %xmm4 2580 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2581 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] 2582 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6] 2583 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 2584 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] 2585 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6] 2586 ; SSE2-NEXT: packuswb %xmm4, %xmm3 2587 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2588 ; SSE2-NEXT: psllw $5, %xmm5 2589 ; SSE2-NEXT: pand %xmm10, %xmm5 2590 ; SSE2-NEXT: movdqa %xmm3, %xmm4 2591 ; SSE2-NEXT: psllw $7, %xmm4 2592 ; SSE2-NEXT: pand %xmm11, %xmm4 2593 ; SSE2-NEXT: movdqa %xmm3, %xmm6 2594 ; SSE2-NEXT: psllw $3, %xmm6 2595 ; SSE2-NEXT: pand %xmm12, %xmm6 2596 ; SSE2-NEXT: por %xmm5, %xmm6 2597 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2598 ; SSE2-NEXT: paddb %xmm5, %xmm5 2599 ; SSE2-NEXT: pand %xmm8, %xmm5 2600 ; SSE2-NEXT: por %xmm6, %xmm5 2601 ; SSE2-NEXT: movdqa %xmm3, %xmm6 2602 ; SSE2-NEXT: psrlw $1, %xmm6 2603 ; SSE2-NEXT: pand %xmm13, %xmm6 2604 ; SSE2-NEXT: por %xmm5, %xmm6 
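; Without pshufb, the SSE2 expansion for each 128-bit chunk first swaps the two bytes of
; every i16 lane (punpck + pshuflw/pshufhw + packuswb), then rebuilds each bit-reversed
; byte from eight shifted-and-masked copies: psllw by 7/5/3, paddb (a left shift by 1),
; and psrlw by 1/3/5/7, each masked to a single bit position and combined with por.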
2605 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2606 ; SSE2-NEXT: psrlw $3, %xmm5 2607 ; SSE2-NEXT: pand %xmm14, %xmm5 2608 ; SSE2-NEXT: por %xmm6, %xmm5 2609 ; SSE2-NEXT: movdqa %xmm3, %xmm6 2610 ; SSE2-NEXT: psrlw $5, %xmm6 2611 ; SSE2-NEXT: pand %xmm15, %xmm6 2612 ; SSE2-NEXT: por %xmm5, %xmm6 2613 ; SSE2-NEXT: psrlw $7, %xmm3 2614 ; SSE2-NEXT: pand %xmm7, %xmm3 2615 ; SSE2-NEXT: por %xmm6, %xmm3 2616 ; SSE2-NEXT: por %xmm4, %xmm3 2617 ; SSE2-NEXT: retq 2618 ; 2619 ; SSSE3-LABEL: test_bitreverse_v32i16: 2620 ; SSSE3: # BB#0: 2621 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 2622 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 2623 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2624 ; SSSE3-NEXT: pshufb %xmm8, %xmm1 2625 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2626 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 2627 ; SSSE3-NEXT: pand %xmm9, %xmm0 2628 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2629 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 2630 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 2631 ; SSSE3-NEXT: psrlw $4, %xmm1 2632 ; SSSE3-NEXT: pand %xmm9, %xmm1 2633 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2634 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 2635 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 2636 ; SSSE3-NEXT: por %xmm6, %xmm0 2637 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 2638 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 2639 ; SSSE3-NEXT: pand %xmm9, %xmm1 2640 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 2641 ; SSSE3-NEXT: pshufb %xmm1, %xmm6 2642 ; SSSE3-NEXT: psrlw $4, %xmm5 2643 ; SSSE3-NEXT: pand %xmm9, %xmm5 2644 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 2645 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 2646 ; SSSE3-NEXT: por %xmm6, %xmm1 2647 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 2648 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 2649 ; SSSE3-NEXT: pand %xmm9, %xmm5 2650 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 2651 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 2652 ; SSSE3-NEXT: psrlw $4, %xmm2 2653 ; SSSE3-NEXT: pand %xmm9, %xmm2 2654 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 2655 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 2656 ; SSSE3-NEXT: por %xmm6, %xmm5 2657 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 2658 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 2659 ; SSSE3-NEXT: pand %xmm9, %xmm2 2660 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 2661 ; SSSE3-NEXT: psrlw $4, %xmm3 2662 ; SSSE3-NEXT: pand %xmm9, %xmm3 2663 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 2664 ; SSSE3-NEXT: por %xmm7, %xmm4 2665 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 2666 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 2667 ; SSSE3-NEXT: retq 2668 ; 2669 ; AVX1-LABEL: test_bitreverse_v32i16: 2670 ; AVX1: # BB#0: 2671 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2672 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2673 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2674 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2675 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2676 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2677 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2678 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2679 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2680 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2681 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2682 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2683 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 2684 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 2685 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2686 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 2687 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2688 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 
2689 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 2690 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2691 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2692 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2693 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 2694 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 2695 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 2696 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2697 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 2698 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 2699 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2700 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 2701 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 2702 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 2703 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2704 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 2705 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 2706 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2707 ; AVX1-NEXT: retq 2708 ; 2709 ; AVX2-LABEL: test_bitreverse_v32i16: 2710 ; AVX2: # BB#0: 2711 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2712 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2713 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2714 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 2715 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2716 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2717 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 2718 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2719 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2720 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2721 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 2722 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2723 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 2724 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2725 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 2726 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2727 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2728 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 2729 ; AVX2-NEXT: retq 2730 ; 2731 ; AVX512F-LABEL: test_bitreverse_v32i16: 2732 ; AVX512F: # BB#0: 2733 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 2734 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2735 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2736 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4 2737 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2738 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 2739 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 2740 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 2741 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2742 ; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0 2743 ; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 2744 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2745 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2 2746 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 2747 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 2748 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 2749 ; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1 2750 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 2751 ; AVX512F-NEXT: retq 2752 ; 2753 ; AVX512BW-LABEL: test_bitreverse_v32i16: 2754 ; AVX512BW: # BB#0: 2755 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] 2756 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2757 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 2758 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2759 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 2760 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 2761 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 2762 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2763 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 2764 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 2765 ; AVX512BW-NEXT: retq 2766 ; 2767 ; XOPAVX1-LABEL: test_bitreverse_v32i16: 2768 ; XOPAVX1: # BB#0: 2769 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2770 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2771 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2772 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2773 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2774 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2775 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2776 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2777 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2778 ; XOPAVX1-NEXT: retq 2779 ; 2780 ; XOPAVX2-LABEL: test_bitreverse_v32i16: 2781 ; XOPAVX2: # BB#0: 2782 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2783 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94] 2784 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2785 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 2786 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2787 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2788 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 2789 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 2790 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2791 ; XOPAVX2-NEXT: retq 2792 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) 2793 ret <32 x i16> %b 2794 } 2795 2796 define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { 2797 ; SSE2-LABEL: test_bitreverse_v16i32: 2798 ; SSE2: # BB#0: 2799 ; SSE2-NEXT: pxor %xmm9, %xmm9 2800 ; SSE2-NEXT: movdqa %xmm0, %xmm4 2801 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2802 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2803 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2804 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 2805 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 2806 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 2807 ; SSE2-NEXT: packuswb %xmm4, %xmm0 2808 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2809 ; SSE2-NEXT: psllw $5, %xmm5 2810 ; SSE2-NEXT: movdqa 
{{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 2811 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 2812 ; SSE2-NEXT: pand %xmm10, %xmm5 2813 ; SSE2-NEXT: movdqa %xmm0, %xmm4 2814 ; SSE2-NEXT: psllw $7, %xmm4 2815 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 2816 ; SSE2-NEXT: pand %xmm11, %xmm11 2817 ; SSE2-NEXT: pand %xmm11, %xmm4 2818 ; SSE2-NEXT: movdqa %xmm0, %xmm6 2819 ; SSE2-NEXT: psllw $3, %xmm6 2820 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 2821 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 2822 ; SSE2-NEXT: pand %xmm12, %xmm6 2823 ; SSE2-NEXT: por %xmm5, %xmm6 2824 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2825 ; SSE2-NEXT: paddb %xmm5, %xmm5 2826 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 2827 ; SSE2-NEXT: pand %xmm8, %xmm5 2828 ; SSE2-NEXT: por %xmm6, %xmm5 2829 ; SSE2-NEXT: movdqa %xmm0, %xmm6 2830 ; SSE2-NEXT: psrlw $1, %xmm6 2831 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 2832 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 2833 ; SSE2-NEXT: pand %xmm13, %xmm6 2834 ; SSE2-NEXT: por %xmm5, %xmm6 2835 ; SSE2-NEXT: movdqa %xmm0, %xmm7 2836 ; SSE2-NEXT: psrlw $3, %xmm7 2837 ; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 2838 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm14 2839 ; SSE2-NEXT: pand %xmm14, %xmm7 2840 ; SSE2-NEXT: por %xmm6, %xmm7 2841 ; SSE2-NEXT: movdqa %xmm0, %xmm5 2842 ; SSE2-NEXT: psrlw $5, %xmm5 2843 ; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2844 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm15 2845 ; SSE2-NEXT: pand %xmm15, %xmm5 2846 ; SSE2-NEXT: por %xmm7, %xmm5 2847 ; SSE2-NEXT: psrlw $7, %xmm0 2848 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2849 ; SSE2-NEXT: pand %xmm7, %xmm7 2850 ; SSE2-NEXT: pand %xmm7, %xmm0 2851 ; SSE2-NEXT: por %xmm5, %xmm0 2852 ; SSE2-NEXT: por %xmm4, %xmm0 2853 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2854 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2855 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2856 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2857 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 2858 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 2859 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 2860 ; SSE2-NEXT: packuswb %xmm4, %xmm1 2861 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2862 ; SSE2-NEXT: psllw $5, %xmm5 2863 ; SSE2-NEXT: pand %xmm10, %xmm5 2864 ; SSE2-NEXT: movdqa %xmm1, %xmm4 2865 ; SSE2-NEXT: psllw $7, %xmm4 2866 ; SSE2-NEXT: pand %xmm11, %xmm4 2867 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2868 ; SSE2-NEXT: psllw $3, %xmm6 2869 ; SSE2-NEXT: pand %xmm12, %xmm6 2870 ; SSE2-NEXT: por %xmm5, %xmm6 2871 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2872 ; SSE2-NEXT: paddb %xmm5, %xmm5 2873 ; SSE2-NEXT: pand %xmm8, %xmm5 2874 ; SSE2-NEXT: por %xmm6, %xmm5 2875 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2876 ; SSE2-NEXT: psrlw $1, %xmm6 2877 ; SSE2-NEXT: pand %xmm13, %xmm6 2878 ; SSE2-NEXT: por %xmm5, %xmm6 2879 ; SSE2-NEXT: movdqa %xmm1, %xmm5 2880 ; SSE2-NEXT: psrlw $3, %xmm5 2881 ; SSE2-NEXT: pand %xmm14, %xmm5 2882 ; SSE2-NEXT: por %xmm6, %xmm5 2883 ; SSE2-NEXT: movdqa %xmm1, %xmm6 2884 ; SSE2-NEXT: psrlw $5, %xmm6 2885 ; SSE2-NEXT: pand %xmm15, 
%xmm6 2886 ; SSE2-NEXT: por %xmm5, %xmm6 2887 ; SSE2-NEXT: psrlw $7, %xmm1 2888 ; SSE2-NEXT: pand %xmm7, %xmm1 2889 ; SSE2-NEXT: por %xmm6, %xmm1 2890 ; SSE2-NEXT: por %xmm4, %xmm1 2891 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2892 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2893 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2894 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2895 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 2896 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 2897 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 2898 ; SSE2-NEXT: packuswb %xmm4, %xmm2 2899 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2900 ; SSE2-NEXT: psllw $5, %xmm5 2901 ; SSE2-NEXT: pand %xmm10, %xmm5 2902 ; SSE2-NEXT: movdqa %xmm2, %xmm4 2903 ; SSE2-NEXT: psllw $7, %xmm4 2904 ; SSE2-NEXT: pand %xmm11, %xmm4 2905 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2906 ; SSE2-NEXT: psllw $3, %xmm6 2907 ; SSE2-NEXT: pand %xmm12, %xmm6 2908 ; SSE2-NEXT: por %xmm5, %xmm6 2909 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2910 ; SSE2-NEXT: paddb %xmm5, %xmm5 2911 ; SSE2-NEXT: pand %xmm8, %xmm5 2912 ; SSE2-NEXT: por %xmm6, %xmm5 2913 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2914 ; SSE2-NEXT: psrlw $1, %xmm6 2915 ; SSE2-NEXT: pand %xmm13, %xmm6 2916 ; SSE2-NEXT: por %xmm5, %xmm6 2917 ; SSE2-NEXT: movdqa %xmm2, %xmm5 2918 ; SSE2-NEXT: psrlw $3, %xmm5 2919 ; SSE2-NEXT: pand %xmm14, %xmm5 2920 ; SSE2-NEXT: por %xmm6, %xmm5 2921 ; SSE2-NEXT: movdqa %xmm2, %xmm6 2922 ; SSE2-NEXT: psrlw $5, %xmm6 2923 ; SSE2-NEXT: pand %xmm15, %xmm6 2924 ; SSE2-NEXT: por %xmm5, %xmm6 2925 ; SSE2-NEXT: psrlw $7, %xmm2 2926 ; SSE2-NEXT: pand %xmm7, %xmm2 2927 ; SSE2-NEXT: por %xmm6, %xmm2 2928 ; SSE2-NEXT: por %xmm4, %xmm2 2929 ; SSE2-NEXT: movdqa %xmm3, %xmm4 2930 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 2931 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 2932 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 2933 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 2934 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 2935 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 2936 ; SSE2-NEXT: packuswb %xmm4, %xmm3 2937 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2938 ; SSE2-NEXT: psllw $5, %xmm5 2939 ; SSE2-NEXT: pand %xmm10, %xmm5 2940 ; SSE2-NEXT: movdqa %xmm3, %xmm4 2941 ; SSE2-NEXT: psllw $7, %xmm4 2942 ; SSE2-NEXT: pand %xmm11, %xmm4 2943 ; SSE2-NEXT: movdqa %xmm3, %xmm6 2944 ; SSE2-NEXT: psllw $3, %xmm6 2945 ; SSE2-NEXT: pand %xmm12, %xmm6 2946 ; SSE2-NEXT: por %xmm5, %xmm6 2947 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2948 ; SSE2-NEXT: paddb %xmm5, %xmm5 2949 ; SSE2-NEXT: pand %xmm8, %xmm5 2950 ; SSE2-NEXT: por %xmm6, %xmm5 2951 ; SSE2-NEXT: movdqa %xmm3, %xmm6 2952 ; SSE2-NEXT: psrlw $1, %xmm6 2953 ; SSE2-NEXT: pand %xmm13, %xmm6 2954 ; SSE2-NEXT: por %xmm5, %xmm6 2955 ; SSE2-NEXT: movdqa %xmm3, %xmm5 2956 ; SSE2-NEXT: psrlw $3, %xmm5 2957 ; SSE2-NEXT: pand %xmm14, %xmm5 2958 ; SSE2-NEXT: por %xmm6, %xmm5 2959 ; SSE2-NEXT: movdqa %xmm3, %xmm6 2960 ; SSE2-NEXT: psrlw $5, %xmm6 2961 ; SSE2-NEXT: pand %xmm15, %xmm6 2962 ; 
SSE2-NEXT: por %xmm5, %xmm6 2963 ; SSE2-NEXT: psrlw $7, %xmm3 2964 ; SSE2-NEXT: pand %xmm7, %xmm3 2965 ; SSE2-NEXT: por %xmm6, %xmm3 2966 ; SSE2-NEXT: por %xmm4, %xmm3 2967 ; SSE2-NEXT: retq 2968 ; 2969 ; SSSE3-LABEL: test_bitreverse_v16i32: 2970 ; SSSE3: # BB#0: 2971 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 2972 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 2973 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 2974 ; SSSE3-NEXT: pshufb %xmm8, %xmm1 2975 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 2976 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 2977 ; SSSE3-NEXT: pand %xmm9, %xmm0 2978 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 2979 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 2980 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 2981 ; SSSE3-NEXT: psrlw $4, %xmm1 2982 ; SSSE3-NEXT: pand %xmm9, %xmm1 2983 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 2984 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 2985 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 2986 ; SSSE3-NEXT: por %xmm6, %xmm0 2987 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 2988 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 2989 ; SSSE3-NEXT: pand %xmm9, %xmm1 2990 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 2991 ; SSSE3-NEXT: pshufb %xmm1, %xmm6 2992 ; SSSE3-NEXT: psrlw $4, %xmm5 2993 ; SSSE3-NEXT: pand %xmm9, %xmm5 2994 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 2995 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 2996 ; SSSE3-NEXT: por %xmm6, %xmm1 2997 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 2998 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 2999 ; SSSE3-NEXT: pand %xmm9, %xmm5 3000 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 3001 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 3002 ; SSSE3-NEXT: psrlw $4, %xmm2 3003 ; SSSE3-NEXT: pand %xmm9, %xmm2 3004 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 3005 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 3006 ; SSSE3-NEXT: por %xmm6, %xmm5 3007 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 3008 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 3009 ; SSSE3-NEXT: pand %xmm9, %xmm2 3010 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 3011 ; SSSE3-NEXT: psrlw $4, %xmm3 3012 ; SSSE3-NEXT: pand %xmm9, %xmm3 3013 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 3014 ; SSSE3-NEXT: por %xmm7, %xmm4 3015 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 3016 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 3017 ; SSSE3-NEXT: retq 3018 ; 3019 ; AVX1-LABEL: test_bitreverse_v16i32: 3020 ; AVX1: # BB#0: 3021 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3022 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 3023 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3024 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3025 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3026 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3027 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3028 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3029 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3030 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3031 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3032 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3033 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3034 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 3035 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3036 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 3037 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 3038 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 3039 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 3040 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3041 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3042 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3043 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3044 ; AVX1-NEXT: vpshufb 
%xmm5, %xmm6, %xmm5 3045 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3046 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3047 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3048 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3049 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3050 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 3051 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3052 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 3053 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 3054 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 3055 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 3056 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3057 ; AVX1-NEXT: retq 3058 ; 3059 ; AVX2-LABEL: test_bitreverse_v16i32: 3060 ; AVX2: # BB#0: 3061 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 3062 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3063 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3064 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 3065 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3066 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3067 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 3068 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 3069 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3070 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 3071 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 3072 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3073 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 3074 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3075 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 3076 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3077 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 3078 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 3079 ; AVX2-NEXT: retq 3080 ; 3081 ; AVX512F-LABEL: test_bitreverse_v16i32: 3082 ; AVX512F: # BB#0: 3083 ; AVX512F-NEXT: vpslld $29, %zmm0, %zmm1 3084 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1 3085 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm2 3086 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3087 ; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1 3088 ; AVX512F-NEXT: vpslld $27, %zmm0, %zmm2 3089 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3090 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3091 ; AVX512F-NEXT: vpslld $25, %zmm0, %zmm2 3092 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3093 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3094 ; AVX512F-NEXT: vpslld $23, %zmm0, %zmm2 3095 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3096 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3097 ; AVX512F-NEXT: vpslld $21, %zmm0, %zmm2 3098 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3099 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3100 ; AVX512F-NEXT: vpslld $19, %zmm0, %zmm2 3101 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3102 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3103 ; AVX512F-NEXT: vpslld $17, %zmm0, %zmm2 3104 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3105 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3106 ; AVX512F-NEXT: vpslld $15, %zmm0, %zmm2 3107 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3108 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3109 ; AVX512F-NEXT: vpslld $13, %zmm0, %zmm2 3110 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3111 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3112 ; AVX512F-NEXT: vpslld $11, %zmm0, %zmm2 3113 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3114 ; AVX512F-NEXT: vpord 
%zmm2, %zmm1, %zmm1 3115 ; AVX512F-NEXT: vpslld $9, %zmm0, %zmm2 3116 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3117 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3118 ; AVX512F-NEXT: vpslld $7, %zmm0, %zmm2 3119 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3120 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3121 ; AVX512F-NEXT: vpslld $5, %zmm0, %zmm2 3122 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3123 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3124 ; AVX512F-NEXT: vpslld $3, %zmm0, %zmm2 3125 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3126 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3127 ; AVX512F-NEXT: vpslld $1, %zmm0, %zmm2 3128 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3129 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3130 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm2 3131 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3132 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3133 ; AVX512F-NEXT: vpsrld $3, %zmm0, %zmm2 3134 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3135 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3136 ; AVX512F-NEXT: vpsrld $5, %zmm0, %zmm2 3137 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3138 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3139 ; AVX512F-NEXT: vpsrld $7, %zmm0, %zmm2 3140 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3141 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3142 ; AVX512F-NEXT: vpsrld $9, %zmm0, %zmm2 3143 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3144 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3145 ; AVX512F-NEXT: vpsrld $11, %zmm0, %zmm2 3146 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3147 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3148 ; AVX512F-NEXT: vpsrld $13, %zmm0, %zmm2 3149 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3150 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3151 ; AVX512F-NEXT: vpsrld $15, %zmm0, %zmm2 3152 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3153 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3154 ; AVX512F-NEXT: vpsrld $17, %zmm0, %zmm2 3155 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3156 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3157 ; AVX512F-NEXT: vpsrld $19, %zmm0, %zmm2 3158 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3159 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3160 ; AVX512F-NEXT: vpsrld $21, %zmm0, %zmm2 3161 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3162 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3163 ; AVX512F-NEXT: vpsrld $23, %zmm0, %zmm2 3164 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3165 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3166 ; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm2 3167 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3168 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3169 ; AVX512F-NEXT: vpsrld $27, %zmm0, %zmm2 3170 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3171 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3172 ; AVX512F-NEXT: vpsrld $29, %zmm0, %zmm2 3173 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 3174 ; AVX512F-NEXT: vpord %zmm2, %zmm1, %zmm1 3175 ; AVX512F-NEXT: vpsrld $31, %zmm0, %zmm0 3176 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 3177 ; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0 3178 ; AVX512F-NEXT: retq 3179 ; 3180 ; AVX512BW-LABEL: test_bitreverse_v16i32: 3181 ; AVX512BW: # BB#0: 3182 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] 3183 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3184 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 3185 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3186 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 3187 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 3188 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 3189 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3190 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 3191 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 3192 ; AVX512BW-NEXT: retq 3193 ; 3194 ; XOPAVX1-LABEL: test_bitreverse_v16i32: 3195 ; XOPAVX1: # BB#0: 3196 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3197 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 3198 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3199 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3200 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3201 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3202 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3203 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3204 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3205 ; XOPAVX1-NEXT: retq 3206 ; 3207 ; XOPAVX2-LABEL: test_bitreverse_v16i32: 3208 ; XOPAVX2: # BB#0: 3209 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 3210 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92] 3211 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3212 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3213 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 3214 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3215 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3216 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3217 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 3218 ; XOPAVX2-NEXT: retq 3219 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) 3220 ret <16 x i32> %b 3221 } 3222 3223 define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { 3224 ; SSE2-LABEL: test_bitreverse_v8i64: 3225 ; SSE2: # BB#0: 3226 ; SSE2-NEXT: pxor %xmm9, %xmm9 3227 ; SSE2-NEXT: movdqa %xmm0, %xmm4 3228 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 3229 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 3230 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 3231 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 3232 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 3233 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 3234 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 3235 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 3236 ; SSE2-NEXT: packuswb %xmm4, 
%xmm0 3237 ; SSE2-NEXT: movdqa %xmm0, %xmm5 3238 ; SSE2-NEXT: psllw $5, %xmm5 3239 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 3240 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm10 3241 ; SSE2-NEXT: pand %xmm10, %xmm5 3242 ; SSE2-NEXT: movdqa %xmm0, %xmm4 3243 ; SSE2-NEXT: psllw $7, %xmm4 3244 ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] 3245 ; SSE2-NEXT: pand %xmm11, %xmm11 3246 ; SSE2-NEXT: pand %xmm11, %xmm4 3247 ; SSE2-NEXT: movdqa %xmm0, %xmm6 3248 ; SSE2-NEXT: psllw $3, %xmm6 3249 ; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] 3250 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm12 3251 ; SSE2-NEXT: pand %xmm12, %xmm6 3252 ; SSE2-NEXT: por %xmm5, %xmm6 3253 ; SSE2-NEXT: movdqa %xmm0, %xmm5 3254 ; SSE2-NEXT: paddb %xmm5, %xmm5 3255 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 3256 ; SSE2-NEXT: pand %xmm8, %xmm5 3257 ; SSE2-NEXT: por %xmm6, %xmm5 3258 ; SSE2-NEXT: movdqa %xmm0, %xmm6 3259 ; SSE2-NEXT: psrlw $1, %xmm6 3260 ; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 3261 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm13 3262 ; SSE2-NEXT: pand %xmm13, %xmm6 3263 ; SSE2-NEXT: por %xmm5, %xmm6 3264 ; SSE2-NEXT: movdqa %xmm0, %xmm7 3265 ; SSE2-NEXT: psrlw $3, %xmm7 3266 ; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] 3267 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm14 3268 ; SSE2-NEXT: pand %xmm14, %xmm7 3269 ; SSE2-NEXT: por %xmm6, %xmm7 3270 ; SSE2-NEXT: movdqa %xmm0, %xmm5 3271 ; SSE2-NEXT: psrlw $5, %xmm5 3272 ; SSE2-NEXT: movdqa {{.*#+}} xmm15 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 3273 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm15 3274 ; SSE2-NEXT: pand %xmm15, %xmm5 3275 ; SSE2-NEXT: por %xmm7, %xmm5 3276 ; SSE2-NEXT: psrlw $7, %xmm0 3277 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 3278 ; SSE2-NEXT: pand %xmm7, %xmm7 3279 ; SSE2-NEXT: pand %xmm7, %xmm0 3280 ; SSE2-NEXT: por %xmm5, %xmm0 3281 ; SSE2-NEXT: por %xmm4, %xmm0 3282 ; SSE2-NEXT: movdqa %xmm1, %xmm4 3283 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 3284 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 3285 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 3286 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 3287 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 3288 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 3289 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 3290 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 3291 ; SSE2-NEXT: packuswb %xmm4, %xmm1 3292 ; SSE2-NEXT: movdqa %xmm1, %xmm5 3293 ; SSE2-NEXT: psllw $5, %xmm5 3294 ; SSE2-NEXT: pand %xmm10, %xmm5 3295 ; SSE2-NEXT: movdqa %xmm1, %xmm4 3296 ; SSE2-NEXT: psllw $7, %xmm4 3297 ; SSE2-NEXT: pand %xmm11, %xmm4 3298 ; SSE2-NEXT: movdqa %xmm1, %xmm6 3299 ; SSE2-NEXT: psllw $3, %xmm6 3300 ; SSE2-NEXT: pand %xmm12, %xmm6 3301 ; SSE2-NEXT: por %xmm5, %xmm6 3302 ; SSE2-NEXT: movdqa %xmm1, %xmm5 3303 ; SSE2-NEXT: paddb %xmm5, %xmm5 3304 ; SSE2-NEXT: pand %xmm8, %xmm5 3305 ; SSE2-NEXT: por %xmm6, %xmm5 3306 ; SSE2-NEXT: movdqa %xmm1, %xmm6 3307 ; SSE2-NEXT: psrlw $1, %xmm6 3308 ; SSE2-NEXT: pand %xmm13, %xmm6 3309 ; SSE2-NEXT: por %xmm5, %xmm6 3310 ; SSE2-NEXT: movdqa %xmm1, 
%xmm5 3311 ; SSE2-NEXT: psrlw $3, %xmm5 3312 ; SSE2-NEXT: pand %xmm14, %xmm5 3313 ; SSE2-NEXT: por %xmm6, %xmm5 3314 ; SSE2-NEXT: movdqa %xmm1, %xmm6 3315 ; SSE2-NEXT: psrlw $5, %xmm6 3316 ; SSE2-NEXT: pand %xmm15, %xmm6 3317 ; SSE2-NEXT: por %xmm5, %xmm6 3318 ; SSE2-NEXT: psrlw $7, %xmm1 3319 ; SSE2-NEXT: pand %xmm7, %xmm1 3320 ; SSE2-NEXT: por %xmm6, %xmm1 3321 ; SSE2-NEXT: por %xmm4, %xmm1 3322 ; SSE2-NEXT: movdqa %xmm2, %xmm4 3323 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 3324 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 3325 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 3326 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 3327 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 3328 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 3329 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 3330 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 3331 ; SSE2-NEXT: packuswb %xmm4, %xmm2 3332 ; SSE2-NEXT: movdqa %xmm2, %xmm5 3333 ; SSE2-NEXT: psllw $5, %xmm5 3334 ; SSE2-NEXT: pand %xmm10, %xmm5 3335 ; SSE2-NEXT: movdqa %xmm2, %xmm4 3336 ; SSE2-NEXT: psllw $7, %xmm4 3337 ; SSE2-NEXT: pand %xmm11, %xmm4 3338 ; SSE2-NEXT: movdqa %xmm2, %xmm6 3339 ; SSE2-NEXT: psllw $3, %xmm6 3340 ; SSE2-NEXT: pand %xmm12, %xmm6 3341 ; SSE2-NEXT: por %xmm5, %xmm6 3342 ; SSE2-NEXT: movdqa %xmm2, %xmm5 3343 ; SSE2-NEXT: paddb %xmm5, %xmm5 3344 ; SSE2-NEXT: pand %xmm8, %xmm5 3345 ; SSE2-NEXT: por %xmm6, %xmm5 3346 ; SSE2-NEXT: movdqa %xmm2, %xmm6 3347 ; SSE2-NEXT: psrlw $1, %xmm6 3348 ; SSE2-NEXT: pand %xmm13, %xmm6 3349 ; SSE2-NEXT: por %xmm5, %xmm6 3350 ; SSE2-NEXT: movdqa %xmm2, %xmm5 3351 ; SSE2-NEXT: psrlw $3, %xmm5 3352 ; SSE2-NEXT: pand %xmm14, %xmm5 3353 ; SSE2-NEXT: por %xmm6, %xmm5 3354 ; SSE2-NEXT: movdqa %xmm2, %xmm6 3355 ; SSE2-NEXT: psrlw $5, %xmm6 3356 ; SSE2-NEXT: pand %xmm15, %xmm6 3357 ; SSE2-NEXT: por %xmm5, %xmm6 3358 ; SSE2-NEXT: psrlw $7, %xmm2 3359 ; SSE2-NEXT: pand %xmm7, %xmm2 3360 ; SSE2-NEXT: por %xmm6, %xmm2 3361 ; SSE2-NEXT: por %xmm4, %xmm2 3362 ; SSE2-NEXT: movdqa %xmm3, %xmm4 3363 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 3364 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 3365 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] 3366 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] 3367 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 3368 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 3369 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] 3370 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] 3371 ; SSE2-NEXT: packuswb %xmm4, %xmm3 3372 ; SSE2-NEXT: movdqa %xmm3, %xmm5 3373 ; SSE2-NEXT: psllw $5, %xmm5 3374 ; SSE2-NEXT: pand %xmm10, %xmm5 3375 ; SSE2-NEXT: movdqa %xmm3, %xmm4 3376 ; SSE2-NEXT: psllw $7, %xmm4 3377 ; SSE2-NEXT: pand %xmm11, %xmm4 3378 ; SSE2-NEXT: movdqa %xmm3, %xmm6 3379 ; SSE2-NEXT: psllw $3, %xmm6 3380 ; SSE2-NEXT: pand %xmm12, %xmm6 3381 ; SSE2-NEXT: por %xmm5, %xmm6 3382 ; SSE2-NEXT: movdqa %xmm3, %xmm5 3383 ; SSE2-NEXT: paddb %xmm5, %xmm5 3384 ; SSE2-NEXT: pand %xmm8, %xmm5 3385 
; SSE2-NEXT: por %xmm6, %xmm5 3386 ; SSE2-NEXT: movdqa %xmm3, %xmm6 3387 ; SSE2-NEXT: psrlw $1, %xmm6 3388 ; SSE2-NEXT: pand %xmm13, %xmm6 3389 ; SSE2-NEXT: por %xmm5, %xmm6 3390 ; SSE2-NEXT: movdqa %xmm3, %xmm5 3391 ; SSE2-NEXT: psrlw $3, %xmm5 3392 ; SSE2-NEXT: pand %xmm14, %xmm5 3393 ; SSE2-NEXT: por %xmm6, %xmm5 3394 ; SSE2-NEXT: movdqa %xmm3, %xmm6 3395 ; SSE2-NEXT: psrlw $5, %xmm6 3396 ; SSE2-NEXT: pand %xmm15, %xmm6 3397 ; SSE2-NEXT: por %xmm5, %xmm6 3398 ; SSE2-NEXT: psrlw $7, %xmm3 3399 ; SSE2-NEXT: pand %xmm7, %xmm3 3400 ; SSE2-NEXT: por %xmm6, %xmm3 3401 ; SSE2-NEXT: por %xmm4, %xmm3 3402 ; SSE2-NEXT: retq 3403 ; 3404 ; SSSE3-LABEL: test_bitreverse_v8i64: 3405 ; SSSE3: # BB#0: 3406 ; SSSE3-NEXT: movdqa %xmm1, %xmm5 3407 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 3408 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3409 ; SSSE3-NEXT: pshufb %xmm8, %xmm1 3410 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3411 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 3412 ; SSSE3-NEXT: pand %xmm9, %xmm0 3413 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3414 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 3415 ; SSSE3-NEXT: pshufb %xmm0, %xmm6 3416 ; SSSE3-NEXT: psrlw $4, %xmm1 3417 ; SSSE3-NEXT: pand %xmm9, %xmm1 3418 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3419 ; SSSE3-NEXT: movdqa %xmm4, %xmm0 3420 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 3421 ; SSSE3-NEXT: por %xmm6, %xmm0 3422 ; SSSE3-NEXT: pshufb %xmm8, %xmm5 3423 ; SSSE3-NEXT: movdqa %xmm5, %xmm1 3424 ; SSSE3-NEXT: pand %xmm9, %xmm1 3425 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 3426 ; SSSE3-NEXT: pshufb %xmm1, %xmm6 3427 ; SSSE3-NEXT: psrlw $4, %xmm5 3428 ; SSSE3-NEXT: pand %xmm9, %xmm5 3429 ; SSSE3-NEXT: movdqa %xmm4, %xmm1 3430 ; SSSE3-NEXT: pshufb %xmm5, %xmm1 3431 ; SSSE3-NEXT: por %xmm6, %xmm1 3432 ; SSSE3-NEXT: pshufb %xmm8, %xmm2 3433 ; SSSE3-NEXT: movdqa %xmm2, %xmm5 3434 ; SSSE3-NEXT: pand %xmm9, %xmm5 3435 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 3436 ; SSSE3-NEXT: pshufb %xmm5, %xmm6 3437 ; SSSE3-NEXT: psrlw $4, %xmm2 3438 ; SSSE3-NEXT: pand %xmm9, %xmm2 3439 ; SSSE3-NEXT: movdqa %xmm4, %xmm5 3440 ; SSSE3-NEXT: pshufb %xmm2, %xmm5 3441 ; SSSE3-NEXT: por %xmm6, %xmm5 3442 ; SSSE3-NEXT: pshufb %xmm8, %xmm3 3443 ; SSSE3-NEXT: movdqa %xmm3, %xmm2 3444 ; SSSE3-NEXT: pand %xmm9, %xmm2 3445 ; SSSE3-NEXT: pshufb %xmm2, %xmm7 3446 ; SSSE3-NEXT: psrlw $4, %xmm3 3447 ; SSSE3-NEXT: pand %xmm9, %xmm3 3448 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 3449 ; SSSE3-NEXT: por %xmm7, %xmm4 3450 ; SSSE3-NEXT: movdqa %xmm5, %xmm2 3451 ; SSSE3-NEXT: movdqa %xmm4, %xmm3 3452 ; SSSE3-NEXT: retq 3453 ; 3454 ; AVX1-LABEL: test_bitreverse_v8i64: 3455 ; AVX1: # BB#0: 3456 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3457 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3458 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3459 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3460 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3461 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3462 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3463 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3464 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3465 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3466 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3467 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3468 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 3469 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5 3470 ; 
AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3471 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 3472 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 3473 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0 3474 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 3475 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3476 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3477 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 3478 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 3479 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 3480 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 3481 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 3482 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 3483 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 3484 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 3485 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3 3486 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 3487 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 3488 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 3489 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1 3490 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 3491 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3492 ; AVX1-NEXT: retq 3493 ; 3494 ; AVX2-LABEL: test_bitreverse_v8i64: 3495 ; AVX2: # BB#0: 3496 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 3497 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3498 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3499 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 3500 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3501 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 3502 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 3503 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 3504 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3505 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 3506 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 3507 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3508 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 3509 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 3510 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 3511 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 3512 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 3513 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 3514 ; AVX2-NEXT: retq 3515 ; 3516 ; AVX512F-LABEL: test_bitreverse_v8i64: 3517 ; AVX512F: # BB#0: 3518 ; AVX512F-NEXT: vpsllq $61, %zmm0, %zmm1 3519 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 3520 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm2 3521 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3522 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 3523 ; AVX512F-NEXT: vpsllq $59, %zmm0, %zmm2 3524 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3525 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3526 ; AVX512F-NEXT: vpsllq $57, %zmm0, %zmm2 3527 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3528 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3529 ; AVX512F-NEXT: vpsllq $55, %zmm0, %zmm2 3530 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3531 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3532 ; AVX512F-NEXT: vpsllq $53, %zmm0, %zmm2 3533 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3534 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3535 ; AVX512F-NEXT: vpsllq $51, %zmm0, %zmm2 3536 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3537 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3538 ; AVX512F-NEXT: vpsllq $49, %zmm0, %zmm2 3539 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3540 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3541 ; AVX512F-NEXT: vpsllq 
$47, %zmm0, %zmm2 3542 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3543 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3544 ; AVX512F-NEXT: vpsllq $45, %zmm0, %zmm2 3545 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3546 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3547 ; AVX512F-NEXT: vpsllq $43, %zmm0, %zmm2 3548 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3549 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3550 ; AVX512F-NEXT: vpsllq $41, %zmm0, %zmm2 3551 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3552 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3553 ; AVX512F-NEXT: vpsllq $39, %zmm0, %zmm2 3554 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3555 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3556 ; AVX512F-NEXT: vpsllq $37, %zmm0, %zmm2 3557 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3558 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3559 ; AVX512F-NEXT: vpsllq $35, %zmm0, %zmm2 3560 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3561 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3562 ; AVX512F-NEXT: vpsllq $33, %zmm0, %zmm2 3563 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3564 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3565 ; AVX512F-NEXT: vpsllq $31, %zmm0, %zmm2 3566 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3567 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3568 ; AVX512F-NEXT: vpsllq $29, %zmm0, %zmm2 3569 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3570 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3571 ; AVX512F-NEXT: vpsllq $27, %zmm0, %zmm2 3572 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3573 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3574 ; AVX512F-NEXT: vpsllq $25, %zmm0, %zmm2 3575 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3576 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3577 ; AVX512F-NEXT: vpsllq $23, %zmm0, %zmm2 3578 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3579 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3580 ; AVX512F-NEXT: vpsllq $21, %zmm0, %zmm2 3581 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3582 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3583 ; AVX512F-NEXT: vpsllq $19, %zmm0, %zmm2 3584 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3585 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3586 ; AVX512F-NEXT: vpsllq $17, %zmm0, %zmm2 3587 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3588 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3589 ; AVX512F-NEXT: vpsllq $15, %zmm0, %zmm2 3590 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3591 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3592 ; AVX512F-NEXT: vpsllq $13, %zmm0, %zmm2 3593 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3594 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3595 ; AVX512F-NEXT: vpsllq $11, %zmm0, %zmm2 3596 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3597 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3598 ; AVX512F-NEXT: vpsllq $9, %zmm0, %zmm2 3599 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3600 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3601 ; AVX512F-NEXT: vpsllq $7, %zmm0, %zmm2 3602 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3603 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3604 ; AVX512F-NEXT: vpsllq $5, %zmm0, %zmm2 3605 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3606 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3607 ; AVX512F-NEXT: vpsllq $3, %zmm0, %zmm2 3608 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3609 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3610 ; AVX512F-NEXT: vpsllq $1, %zmm0, %zmm2 
3611 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3612 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3613 ; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm2 3614 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3615 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3616 ; AVX512F-NEXT: vpsrlq $3, %zmm0, %zmm2 3617 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3618 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3619 ; AVX512F-NEXT: vpsrlq $5, %zmm0, %zmm2 3620 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3621 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3622 ; AVX512F-NEXT: vpsrlq $7, %zmm0, %zmm2 3623 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3624 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3625 ; AVX512F-NEXT: vpsrlq $9, %zmm0, %zmm2 3626 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3627 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3628 ; AVX512F-NEXT: vpsrlq $11, %zmm0, %zmm2 3629 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3630 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3631 ; AVX512F-NEXT: vpsrlq $13, %zmm0, %zmm2 3632 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3633 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3634 ; AVX512F-NEXT: vpsrlq $15, %zmm0, %zmm2 3635 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3636 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3637 ; AVX512F-NEXT: vpsrlq $17, %zmm0, %zmm2 3638 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3639 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3640 ; AVX512F-NEXT: vpsrlq $19, %zmm0, %zmm2 3641 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3642 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3643 ; AVX512F-NEXT: vpsrlq $21, %zmm0, %zmm2 3644 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3645 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3646 ; AVX512F-NEXT: vpsrlq $23, %zmm0, %zmm2 3647 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3648 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3649 ; AVX512F-NEXT: vpsrlq $25, %zmm0, %zmm2 3650 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3651 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3652 ; AVX512F-NEXT: vpsrlq $27, %zmm0, %zmm2 3653 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3654 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3655 ; AVX512F-NEXT: vpsrlq $29, %zmm0, %zmm2 3656 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3657 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3658 ; AVX512F-NEXT: vpsrlq $31, %zmm0, %zmm2 3659 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3660 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3661 ; AVX512F-NEXT: vpsrlq $33, %zmm0, %zmm2 3662 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3663 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3664 ; AVX512F-NEXT: vpsrlq $35, %zmm0, %zmm2 3665 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3666 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3667 ; AVX512F-NEXT: vpsrlq $37, %zmm0, %zmm2 3668 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3669 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3670 ; AVX512F-NEXT: vpsrlq $39, %zmm0, %zmm2 3671 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3672 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3673 ; AVX512F-NEXT: vpsrlq $41, %zmm0, %zmm2 3674 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3675 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3676 ; AVX512F-NEXT: vpsrlq $43, %zmm0, %zmm2 3677 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3678 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3679 ; AVX512F-NEXT: vpsrlq $45, %zmm0, %zmm2 3680 ; 
AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3681 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3682 ; AVX512F-NEXT: vpsrlq $47, %zmm0, %zmm2 3683 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3684 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3685 ; AVX512F-NEXT: vpsrlq $49, %zmm0, %zmm2 3686 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3687 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3688 ; AVX512F-NEXT: vpsrlq $51, %zmm0, %zmm2 3689 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3690 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3691 ; AVX512F-NEXT: vpsrlq $53, %zmm0, %zmm2 3692 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3693 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3694 ; AVX512F-NEXT: vpsrlq $55, %zmm0, %zmm2 3695 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3696 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3697 ; AVX512F-NEXT: vpsrlq $57, %zmm0, %zmm2 3698 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3699 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3700 ; AVX512F-NEXT: vpsrlq $59, %zmm0, %zmm2 3701 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3702 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3703 ; AVX512F-NEXT: vpsrlq $61, %zmm0, %zmm2 3704 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3705 ; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1 3706 ; AVX512F-NEXT: vpsrlq $63, %zmm0, %zmm0 3707 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 3708 ; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0 3709 ; AVX512F-NEXT: retq 3710 ; 3711 ; AVX512BW-LABEL: test_bitreverse_v8i64: 3712 ; AVX512BW: # BB#0: 3713 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] 3714 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 3715 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 3716 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] 3717 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 3718 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 3719 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 3720 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] 3721 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 3722 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 3723 ; AVX512BW-NEXT: retq 3724 ; 3725 ; XOPAVX1-LABEL: test_bitreverse_v8i64: 3726 ; XOPAVX1: # BB#0: 3727 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3728 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88] 3729 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3730 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0 3731 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 3732 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 3733 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2 3734 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1 3735 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 3736 ; XOPAVX1-NEXT: retq 3737 ; 3738 ; XOPAVX2-LABEL: test_bitreverse_v8i64: 3739 ; 
; XOPAVX2: # BB#0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT: retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone
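; A brief note on the three lowering strategies exercised above, plus a scalar
; sketch of the nibble look-up identity (illustrative only; the names below are
; not part of the test, and the sketch is a hedged summary rather than the
; compiler's own implementation):
;
;  * SSE2 and AVX512F (without BW) rebuild each element bit by bit: one shift
;    plus mask per bit position, ORed together (the psllw/psrlw and
;    vpsllq/vpsrlq sequences above).
;  * SSSE3/AVX/AVX2/AVX512BW first byte-swap each element with (v)pshufb, then
;    reverse the bits inside every byte using two 16-entry nibble tables.
;  * XOP uses a single vpperm whose selector bytes (0x50 | index) select a
;    source byte with its bits reversed, so the byte swap and the per-byte bit
;    reversal happen in one instruction.
;
; Scalar C sketch of the nibble-table step (rev_hi, rev_lo and bitreverse8 are
; illustrative names; the tables match the vector constants checked above):
;
;   #include <stdint.h>
;
;   static const uint8_t rev_lo[16] =   /* bitrev4(i)      */
;       {0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15};
;   static const uint8_t rev_hi[16] =   /* bitrev4(i) << 4 */
;       {0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240};
;
;   static uint8_t bitreverse8(uint8_t b) {
;     /* the reversed low nibble moves to the high half and vice versa */
;     return (uint8_t)(rev_hi[b & 0xF] | rev_lo[b >> 4]);
;   }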