//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
        xorl    %eax, %eax
        xorl    %edx, %edx
        testb   $32, %cl
        sete    %al
        setne   %dl
        sall    %cl, %eax
        sall    %cl, %edx

But that requires good 8-bit subreg support.

Also, this might be better.  It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
        movl    %ecx, %eax
        shrl    $5, %eax
        movl    %eax, %edx
        xorl    $1, %edx
        sall    %cl, %eax
        sall    %cl, %edx

64-bit shifts (in general) expand to really bad code.  Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure.  E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator.  Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG.  We still need to teach
the coalescer how to deal with it, though.

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing.  This needs to be investigated.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare.  The
pattern is written as (cmp reg, (load p)).  Because the compare isn't
commutative, it is not matched with the load on both sides.  The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
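
As a small illustration (hypothetical function, not from the original note),
this kind of source tends to produce the unmatched form with the load on the
LHS:

int cmp_load_lhs(int *p, int x) {
  /* The compare here has the load on its LHS: (cmp (load p), reg).  Folding
     the load into a cmp-with-memory instruction means moving it to the RHS,
     which requires commuting the condition (lt becomes gt); that costs
     nothing for integer compares. */
  return *p < x;
}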

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
        movl    8(%esp), %eax
        cmpl    %eax, 4(%esp)
        setl    %al
        movzbl  %al, %eax
        ret

On some processors (which ones?), it is more efficient to do this:

_test:
        movl    8(%esp), %ebx
        xor     %eax, %eax
        cmpl    %ebx, 4(%esp)
        setl    %al
        ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important.  e.g., for:

void setbit(int *target, int bit) {
   *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
   *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl    $16843009, 4(%edx)
        movl    $16843009, (%edx)
        movw    $257, 8(%edx)

It might be better to generate

        movl    $16843009, %eax
        movl    %eax, 4(%edx)
        movl    %eax, (%edx)
        movw    %ax, 8(%edx)

when we can spare a register.  It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
get this:

define i32 @test1(i32 %X) {
        %Y = sdiv i32 %X, 8
        ret i32 %Y
}

_test1:
        movl    4(%esp), %eax
        movl    %eax, %ecx
        sarl    $31, %ecx
        shrl    $29, %ecx
        addl    %ecx, %eax
        sarl    $3, %eax
        ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
        movl    4(%esp), %eax
        cmpl    $-1, %eax
        leal    7(%eax), %ecx
        cmovle  %ecx, %eax
        sarl    $3, %eax
        ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl.  We should leave these as libcalls for everything over a much lower
threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
 x * copysign(1.0, y) * copysign(1.0, z)

//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

_test:
        movl    8(%esp), %eax
        movaps  (%eax), %xmm0
        movl    4(%esp), %eax
        movaps  (%eax), %xmm1
        comiss  %xmm0, %xmm1
        setae   %al
        movzbl  %al, %ecx
        movl    $3, %eax
        movl    $4, %edx
        cmpl    $0, %ecx
        cmove   %edx, %eax
        ret

Note that the setae, movzbl, cmpl, and cmove can be replaced with a single
cmovae.  There are a number of issues:
1) We are introducing a setcc between the result of the intrinsic call and
   the select.
2) The intrinsic is expected to produce an i32 value, so an any_extend (which
   becomes a zero_extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
  return a * 3;
}

We currently emit:
        imull   $3, 4(%esp), %eax

Perhaps this is what we really should generate.  Is imull three or four
cycles?  Note: ICC generates this:
        movl    4(%esp), %eax
        leal    (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity.  The former is
more "complex" because it folds a load, so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority?  We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better.  It's two bytes
shorter than movl + leal.

On a Pentium M, both variants have the same characteristics with regard
to throughput; however, the multiplication has a latency of four cycles, as
opposed to two cycles for the movl+lea variant.

//===---------------------------------------------------------------------===//

It appears that gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

define i32 @foo(i32* %a, i32 %t) {
entry:
        br label %cond_true

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]          ; <i32> [#uses=3]
        %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]    ; <i32> [#uses=1]
        %tmp2 = getelementptr i32* %a, i32 %x.0.0       ; <i32*> [#uses=1]
        %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
        %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
        %tmp7 = add i32 %tmp5, %tmp3    ; <i32> [#uses=2]
        %tmp9 = add i32 %x.0.0, 1       ; <i32> [#uses=2]
        %tmp = icmp sgt i32 %tmp9, 39   ; <i1> [#uses=1]
        br i1 %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
        ret i32 %tmp7
}

is pessimized by -loop-reduce and -indvars

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  fh *= 0x1.0p16f;
  return fh + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
00000030        ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align stack slots of arguments of type double on an
8-byte boundary to improve performance.

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b".

//===---------------------------------------------------------------------===//

Consider the expansion of:

define i32 @test3(i32 %X) {
        %tmp1 = urem i32 %X, 255
        ret i32 %tmp1
}

Currently it compiles to:

...
        movl    $2155905153, %ecx
        movl    8(%esp), %esi
        movl    %esi, %eax
        mull    %ecx
...

This could be "reassociated" into:

        movl    $2155905153, %eax
        movl    8(%esp), %ecx
        mull    %ecx

to avoid the copy.  In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction.  I guess this has
to be done at isel time based on the #uses of the mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary.  This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable.  Example from 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary.  In addition to potential cache misses, this can't help decoding, as
I imagine there has to be some kind of complicated decoder reset and
realignment to grab the bytes from the next cacheline.

  532   532  0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
  942   942  0x3d03 movl     %dh, (1809(%esp, %esi)
  937   937  0x3d0a incl     %esi
  3     3    0x3d0b cmpb     %bl, %dl
  27    27   0x3d0d jnz      0x000062db <main+11707>

//===---------------------------------------------------------------------===//

In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//

This could be a single 16-bit load.

int f(char *p) {
  if ((p[0] == 1) & (p[1] == 2)) return 1;
  return 0;
}

//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

//===---------------------------------------------------------------------===//

This code:

void test(int X) {
  if (X) abort();
}

is currently compiled to:

_test:
        subl    $12, %esp
        cmpl    $0, 16(%esp)
        jne     LBB1_1
        addl    $12, %esp
        ret
LBB1_1:
        call    L_abort$stub

It would be better to produce:

_test:
        subl    $12, %esp
        cmpl    $0, 16(%esp)
        jne     L_abort$stub
        addl    $12, %esp
        ret

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

_test:
        cmpl    $0, 4(%esp)
        jne     LBB1_1
        ret
LBB1_1:
        subl    $12, %esp
        call    L_abort$stub

Both are useful in different situations.  Finally, it could be shrink-wrapped
and tail called, like this:

_test:
        cmpl    $0, 4(%esp)
        jne     LBB1_1
        ret
LBB1_1:
        pop     %eax   # realign stack.
        call    L_abort$stub

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction.  Consider:

int test(char X) { return 7-X; }

we currently produce:
_test:
        movl    $7, %eax
        movsbl  4(%esp), %ecx
        subl    %ecx, %eax
        ret

We would use one fewer register if codegen'd as:

        movsbl  4(%esp), %eax
        neg     %eax
        add     $7, %eax
        ret

Note that this isn't beneficial if the load can be folded into the sub.  In
this case, we want a sub:

int test(int X) { return 7-X; }
_test:
        movl    $7, %eax
        subl    4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

Leaf functions that require one 4-byte spill slot have a prolog like this:

_foo:
        pushl   %esi
        subl    $4, %esp
...
and an epilog like this:
        addl    $4, %esp
        popl    %esi
        ret

It would be smaller, and potentially faster, to push eax on entry and to
pop into a dummy register instead of using addl/subl of esp.  Just don't pop
into any return registers :)

//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches.  We generate really poor code for:

double testf(double a) {
  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

For example, the entry BB is:

_testf:
        subl    $20, %esp
        pxor    %xmm0, %xmm0
        movsd   24(%esp), %xmm1
        ucomisd %xmm0, %xmm1
        setnp   %al
        sete    %cl
        testb   %cl, %al
        jne     LBB1_5  # UnifiedReturnBlock
LBB1_1: # cond_true

it would be better to replace the last four instructions with:

        jp      LBB1_1
        je      LBB1_5
LBB1_1:

We also codegen the inner ?: into a diamond:

        cvtss2sd        LCPI1_0(%rip), %xmm2
        cvtss2sd        LCPI1_1(%rip), %xmm3
        ucomisd %xmm1, %xmm0
        ja      LBB1_3  # cond_true
LBB1_2: # cond_true
        movapd  %xmm3, %xmm2
LBB1_3: # cond_true
        movapd  %xmm2, %xmm0
        ret

We should sink the load into xmm3 into the LBB1_2 block.  This should
be pretty easy, and will nuke all the copies.

//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:

        addl    %esi, %edi
        setae   %al
        movzbl  %al, %eax
        ret

on x86-64, instead of the rather stupid-looking:

        addl    %esi, %edi
        setb    %al
        xorb    $1, %al
        movzbl  %al, %eax
        ret

//===---------------------------------------------------------------------===//

The following code:

bb114.preheader:                ; preds = %cond_next94
        %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
        %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
        %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
        %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
        %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
        %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
        %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
        %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
        %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
        %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
        %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
        br label %bb114

produces:

LBB3_5: # bb114.preheader
        movswl  -68(%ebp), %eax
        movl    $32, %ecx
        movl    %ecx, -80(%ebp)
        subl    %eax, -80(%ebp)
        movswl  -52(%ebp), %eax
        movl    %ecx, -84(%ebp)
        subl    %eax, -84(%ebp)
        movswl  -70(%ebp), %eax
        movl    %ecx, -88(%ebp)
        subl    %eax, -88(%ebp)
        movswl  -50(%ebp), %eax
        subl    %eax, %ecx
        movl    %ecx, -76(%ebp)
        movswl  -42(%ebp), %eax
        movl    %eax, -92(%ebp)
        movswl  -66(%ebp), %eax
        movl    %eax, -96(%ebp)
        movw    $0, -98(%ebp)

This appears to be bad because the RA is not folding the store to the stack
slot into the movl.  The above instructions could be:
        movl    $32, -80(%ebp)
...
        movl    $32, -84(%ebp)
...
This seems like a cross between remat and spill folding.

This has redundant subtractions of %eax from a stack slot.  However, %ecx
doesn't change, so we could simply subtract %eax from %ecx first and then use
%ecx (or vice-versa).

//===---------------------------------------------------------------------===//

This code:

        %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
        br i1 %tmp659, label %cond_true662, label %cond_next715

produces this:

        testw   %cx, %cx
        movswl  %cx, %esi
        jns     LBB4_109        # cond_next715

Shark tells us that using %cx in the testw instruction is sub-optimal.
It suggests using the 32-bit register (which is what ICC uses).

//===---------------------------------------------------------------------===//

We compile this:

void compare (long long foo) {
  if (foo < 4294967297LL)
    abort();
}

to:

compare:
        subl    $4, %esp
        cmpl    $0, 8(%esp)
        setne   %al
        movzbw  %al, %ax
        cmpl    $1, 12(%esp)
        setg    %cl
        movzbw  %cl, %cx
        cmove   %ax, %cx
        testb   $1, %cl
        jne     .LBB1_2 # UnifiedReturnBlock
.LBB1_1:        # ifthen
        call    abort
.LBB1_2:        # UnifiedReturnBlock
        addl    $4, %esp
        ret

(also really horrible code on ppc).  This is due to the expand code for 64-bit
compares.  GCC produces multiple branches, which is much nicer:

compare:
        subl    $12, %esp
        movl    20(%esp), %edx
        movl    16(%esp), %eax
        decl    %edx
        jle     .L7
.L5:
        addl    $12, %esp
        ret
        .p2align 4,,7
.L7:
        jl      .L4
        cmpl    $0, %eax
        .p2align 4,,8
        ja      .L5
.L4:
        .p2align 4,,9
        call    abort

//===---------------------------------------------------------------------===//

Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) that source from the caller's arguments
or that source from a virtual register (also possibly sourcing from the
caller's arguments).
This is done to prevent overwriting of parameters (see the example
below) that might be used later.

example:

int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
  int64 local = arg2 * 2;
  return callee(arg2, (int64)local);
}

[arg1]          [!arg2 no longer valid since we moved local onto it]
[arg2]      ->  [(int64)
[RETADDR]        local  ]

Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.

Possible optimizations:

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee and only
   push them onto the top of the stack.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
       return callee(arg1,arg2);
   }

   Here we don't need to write any variables to the top of the stack
   since they don't overwrite each other.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
       return callee(arg2,arg1);
   }

   Here we need to push the arguments because they overwrite each
   other.

//===---------------------------------------------------------------------===//

main ()
{
  int i = 0;
  unsigned long int z = 0;

  do {
    z -= 0x00004000;
    i++;
    if (i > 0x00040000)
      abort ();
  } while (z > 0);
  exit (0);
}

gcc compiles this to:

_main:
        subl    $28, %esp
        xorl    %eax, %eax
        jmp     L2
L3:
        cmpl    $262144, %eax
        je      L10
L2:
        addl    $1, %eax
        cmpl    $262145, %eax
        jne     L3
        call    L_abort$stub
L10:
        movl    $0, (%esp)
        call    L_exit$stub

llvm:

_main:
        subl    $12, %esp
        movl    $1, %eax
        movl    $16384, %ecx
LBB1_1: # bb
        cmpl    $262145, %eax
        jge     LBB1_4  # cond_true
LBB1_2: # cond_next
        incl    %eax
        addl    $4294950912, %ecx
        cmpl    $16384, %ecx
        jne     LBB1_1  # bb
LBB1_3: # bb11
        xorl    %eax, %eax
        addl    $12, %esp
        ret
LBB1_4: # cond_true
        call    L_abort$stub

1. LSR should rewrite the first cmp with the induction variable %ecx.
2. The DAG combiner should fold
        leal    1(%eax), %edx
        cmpl    $262145, %edx
   =>
        cmpl    $262144, %eax

//===---------------------------------------------------------------------===//

define i64 @test(double %X) {
        %Y = fptosi double %X to i64
        ret i64 %Y
}

compiles to:

_test:
        subl    $20, %esp
        movsd   24(%esp), %xmm0
        movsd   %xmm0, 8(%esp)
        fldl    8(%esp)
        fisttpll        (%esp)
        movl    4(%esp), %edx
        movl    (%esp), %eax
        addl    $20, %esp
        #FP_REG_KILL
        ret

This should just fldl directly from the input stack slot.

//===---------------------------------------------------------------------===//

This code:
int foo (int x) { return (x & 65535) | 255; }

Should compile into:

_foo:
        movzwl  4(%esp), %eax
        orl     $255, %eax
        ret

instead of:
_foo:
        movl    $65280, %eax
        andl    4(%esp), %eax
        orl     $255, %eax
        ret

//===---------------------------------------------------------------------===//

We're codegen'ing multiply of long longs inefficiently:

unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
  return arg1 * arg2;
}

We compile to (-fomit-frame-pointer):

_LLM:
        pushl   %esi
        movl    8(%esp), %ecx
        movl    16(%esp), %esi
        movl    %esi, %eax
        mull    %ecx
        imull   12(%esp), %esi
        addl    %edx, %esi
        imull   20(%esp), %ecx
        movl    %esi, %edx
        addl    %ecx, %edx
        popl    %esi
        ret

This looks like a scheduling deficiency and lack of remat of the load from
the argument area.  ICC apparently produces:

        movl    8(%esp), %ecx
        imull   12(%esp), %ecx
        movl    16(%esp), %eax
        imull   4(%esp), %eax
        addl    %eax, %ecx
        movl    4(%esp), %eax
        mull    12(%esp)
        addl    %ecx, %edx
        ret

Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236

//===---------------------------------------------------------------------===//

We can fold a store into "zeroing a reg".  Instead of:

        xorl    %eax, %eax
        movl    %eax, 124(%esp)

we should get:

        movl    $0, 124(%esp)

if the flags of the xor are dead.

Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
be folded into: shl [mem], 1

//===---------------------------------------------------------------------===//

In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:

        xorpd   LCPI1_0, %xmm2

However, if xmm2 gets spilled, we end up with really ugly code like this:

        movsd   (%esp), %xmm0
        xorpd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)

Since we 'know' that this is a 'neg', we can actually "fold" the spill into
the neg/abs instruction, turning it into an *integer* operation, like this:

        xorl    2147483648, [mem+4]     ## 2147483648 = (1 << 31)

you could also use xorb, but xorl is less likely to lead to a partial register
stall.  Here is a contrived testcase:

double a, b, c;
void test(double *P) {
  double X = *P;
  a = X;
  bar();
  X = -X;
  b = X;
  bar();
  c = X;
}

//===---------------------------------------------------------------------===//

The code generated on x86 for checking for signed overflow on a multiply, done
the obvious way, is much longer than it needs to be.

int x(int a, int b) {
  long long prod = (long long)a*b;
  return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
}

See PR2053 for more details.

//===---------------------------------------------------------------------===//

We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter.  The downside is that it puts more
pressure on register allocation because it has fixed operands.

Example:
int abs(int x) {return x < 0 ? -x : x;}

gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
        movl    4(%esp), %eax
        cltd
        xorl    %edx, %eax
        subl    %edx, %eax
        ret

//===---------------------------------------------------------------------===//

Take the following code (from
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):

extern unsigned char first_one[65536];
int FirstOnet(unsigned long long arg1)
{
  if (arg1 >> 48)
    return (first_one[arg1 >> 48]);
  return 0;
}

The following code is currently generated:
FirstOnet:
        movl    8(%esp), %eax
        cmpl    $65536, %eax
        movl    4(%esp), %ecx
        jb      .LBB1_2 # UnifiedReturnBlock
.LBB1_1:        # ifthen
        shrl    $16, %eax
        movzbl  first_one(%eax), %eax
        ret
.LBB1_2:        # UnifiedReturnBlock
        xorl    %eax, %eax
        ret

We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
lets us change the cmpl into a testl, which is shorter, and eliminate the shift.

//===---------------------------------------------------------------------===//

We compile this function:

define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
entry:
        %tmp2 = icmp eq i8 %d, 0        ; <i1> [#uses=1]
        br i1 %tmp2, label %bb7, label %bb

bb:             ; preds = %entry
        %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
        ret i32 %tmp6

bb7:            ; preds = %entry
        %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
        ret i32 %tmp10
}

to:

foo:                                    # @foo
# BB#0:                                 # %entry
        movl    4(%esp), %ecx
        cmpb    $0, 16(%esp)
        je      .LBB0_2
# BB#1:                                 # %bb
        movl    8(%esp), %eax
        addl    %ecx, %eax
        ret
.LBB0_2:                                # %bb7
        movl    12(%esp), %edx
        movl    %ecx, %eax
        subl    %edx, %eax
        ret

There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
couple more movls by putting 4(%esp) into %eax instead of %ecx.

//===---------------------------------------------------------------------===//

See rdar://4653682.

From flops:

LBB1_15:        # bb310
        cvtss2sd        LCPI1_0, %xmm1
        addsd   %xmm1, %xmm0
        movsd   176(%esp), %xmm2
        mulsd   %xmm0, %xmm2
        movapd  %xmm2, %xmm3
        mulsd   %xmm3, %xmm3
        movapd  %xmm3, %xmm4
        mulsd   LCPI1_23, %xmm4
        addsd   LCPI1_24, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   LCPI1_25, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   LCPI1_26, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   LCPI1_27, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   LCPI1_28, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   %xmm1, %xmm4
        mulsd   %xmm2, %xmm4
        movsd   152(%esp), %xmm1
        addsd   %xmm4, %xmm1
        movsd   %xmm1, 152(%esp)
        incl    %eax
        cmpl    %eax, %esi
        jge     LBB1_15 # bb310
LBB1_16:        # bb358.loopexit
        movsd   152(%esp), %xmm0
        addsd   %xmm0, %xmm0
        addsd   LCPI1_22, %xmm0
        movsd   %xmm0, 152(%esp)

Rather than spilling the result of the last addsd in the loop, we should have
inserted a copy to split the interval (one for the duration of the loop, one
extending to the fall through).  The register pressure in the loop isn't high
enough to warrant the spill.

Also check why xmm7 is not used at all in the function.

//===---------------------------------------------------------------------===//

Take the following:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind {
entry:
        %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
        br i1 %tmp.b.i, label %bb.i, label %bb4.i
bb.i:           ; preds = %entry
        tail call void @exit( i32 1 ) noreturn nounwind
        unreachable
bb4.i:          ; preds = %entry
        store i1 true, i1* @in_exit.4870.b
        tail call void @exit( i32 1 ) noreturn nounwind
        unreachable
}
declare void @exit(i32) noreturn nounwind

This compiles into:
_abort_gzip:                            ## @abort_gzip
## BB#0:                                ## %entry
        subl    $12, %esp
        movb    _in_exit.4870.b, %al
        cmpb    $1, %al
        jne     LBB0_2

We somehow miss folding the movb into the cmpb.

//===---------------------------------------------------------------------===//

We compile:

int test(int x, int y) {
  return x-y-1;
}

into (-m64):

_test:
        decl    %edi
        movl    %edi, %eax
        subl    %esi, %eax
        ret

it would be better to codegen as: x+~y  (notl+addl)

//===---------------------------------------------------------------------===//

This code:

int foo(const char *str,...)
{
  __builtin_va_list a; int x;
  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
  return x;
}

gets compiled into this on x86-64:
        subq    $200, %rsp
        movaps  %xmm7, 160(%rsp)
        movaps  %xmm6, 144(%rsp)
        movaps  %xmm5, 128(%rsp)
        movaps  %xmm4, 112(%rsp)
        movaps  %xmm3, 96(%rsp)
        movaps  %xmm2, 80(%rsp)
        movaps  %xmm1, 64(%rsp)
        movaps  %xmm0, 48(%rsp)
        movq    %r9, 40(%rsp)
        movq    %r8, 32(%rsp)
        movq    %rcx, 24(%rsp)
        movq    %rdx, 16(%rsp)
        movq    %rsi, 8(%rsp)
        leaq    (%rsp), %rax
        movq    %rax, 192(%rsp)
        leaq    208(%rsp), %rax
        movq    %rax, 184(%rsp)
        movl    $48, 180(%rsp)
        movl    $8, 176(%rsp)
        movl    176(%rsp), %eax
        cmpl    $47, %eax
        jbe     .LBB1_3 # bb
.LBB1_1:        # bb3
        movq    184(%rsp), %rcx
        leaq    8(%rcx), %rax
        movq    %rax, 184(%rsp)
.LBB1_2:        # bb4
        movl    (%rcx), %eax
        addq    $200, %rsp
        ret
.LBB1_3:        # bb
        movl    %eax, %ecx
        addl    $8, %eax
        addq    192(%rsp), %rcx
        movl    %eax, 176(%rsp)
        jmp     .LBB1_2 # bb4

gcc 4.3 generates:
        subq    $96, %rsp
.LCFI0:
        leaq    104(%rsp), %rax
        movq    %rsi, -80(%rsp)
        movl    $8, -120(%rsp)
        movq    %rax, -112(%rsp)
        leaq    -88(%rsp), %rax
        movq    %rax, -104(%rsp)
        movl    $8, %eax
        cmpl    $48, %eax
        jb      .L6
        movq    -112(%rsp), %rdx
        movl    (%rdx), %eax
        addq    $96, %rsp
        ret
        .p2align 4,,10
        .p2align 3
.L6:
        mov     %eax, %edx
        addq    -104(%rsp), %rdx
        addl    $8, %eax
        movl    %eax, -120(%rsp)
        movl    (%rdx), %eax
        addq    $96, %rsp
        ret

and it gets compiled into this on x86:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp
        leal    12(%ebp), %eax
        movl    %eax, -4(%ebp)
        leal    16(%ebp), %eax
        movl    %eax, -4(%ebp)
        movl    12(%ebp), %eax
        addl    $4, %esp
        popl    %ebp
        ret

gcc 4.3 generates:
        pushl   %ebp
        movl    %esp, %ebp
        movl    12(%ebp), %eax
        popl    %ebp
        ret

//===---------------------------------------------------------------------===//

Teach tblgen not to check bitconvert source type in some cases.  This allows us
to consolidate the following patterns in X86InstrMMX.td:

def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;

There are other cases in various td files.

//===---------------------------------------------------------------------===//

Take something like the following on x86-32:
unsigned a(unsigned long long x, unsigned y) {return x % y;}

We currently generate a libcall, but we really shouldn't: the expansion is
shorter and likely faster than the libcall.  The expected code is something
like the following:

        movl    12(%ebp), %eax
        movl    16(%ebp), %ecx
        xorl    %edx, %edx
        divl    %ecx
        movl    8(%ebp), %eax
        divl    %ecx
        movl    %edx, %eax
        ret

A similar code sequence works for division.
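
For that last point, a sketch of the corresponding source (hypothetical
function name, not taken from the original note):

/* Hypothetical division analogue of the remainder case above, assuming the
   same two-divl expansion applies.  The truncated quotient is the %eax result
   of the second divl, so the final "movl %edx, %eax" is simply dropped. */
unsigned b(unsigned long long x, unsigned y) { return x / y; }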

//===---------------------------------------------------------------------===//

We currently compile this:

define i32 @func1(i32 %v1, i32 %v2) nounwind {
entry:
  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
  %sum = extractvalue {i32, i1} %t, 0
  %obit = extractvalue {i32, i1} %t, 1
  br i1 %obit, label %overflow, label %normal
normal:
  ret i32 %sum
overflow:
  call void @llvm.trap()
  unreachable
}
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare void @llvm.trap()

to:

_func1:
        movl    4(%esp), %eax
        addl    8(%esp), %eax
        jo      LBB1_2  ## overflow
LBB1_1: ## normal
        ret
LBB1_2: ## overflow
        ud2

It would be nice to produce "into" someday.

//===---------------------------------------------------------------------===//

Test instructions can be eliminated by using EFLAGS values from arithmetic
instructions.  This is currently not done for mul, and, or, xor, neg, shl,
sra, srl, shld, shrd, atomic ops, and others.  It is also currently not done
for read-modify-write instructions.  It is also currently not done if the
OF or CF flags are needed.

The shift operators have the complication that when the shift count is
zero, EFLAGS is not set, so they can only subsume a test instruction if
the shift count is known to be non-zero.  Also, using the EFLAGS value
from a shift is apparently very slow on some x86 implementations.

In read-modify-write instructions, the root node in the isel match is
the store, and isel has no way for the use of the EFLAGS result of the
arithmetic to be remapped to the new node.

Add and subtract instructions set OF on signed overflow and CF on unsigned
overflow, while test instructions always clear OF and CF.  In order to
replace a test with an add or subtract in a situation where OF or CF is
needed, codegen must be able to prove that the operation cannot see
signed or unsigned overflow, respectively.

//===---------------------------------------------------------------------===//

memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
define <16 x float> @foo(<16 x float> %A) nounwind {
  %tmp = alloca <16 x float>, align 16
  %tmp2 = alloca <16 x float>, align 16
  store <16 x float> %A, <16 x float>* %tmp
  %s = bitcast <16 x float>* %tmp to i8*
  %s2 = bitcast <16 x float>* %tmp2 to i8*
  call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
  %R = load <16 x float>* %tmp2
  ret <16 x float> %R
}

declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind

which compiles to:

_foo:
        subl    $140, %esp
        movaps  %xmm3, 112(%esp)
        movaps  %xmm2, 96(%esp)
        movaps  %xmm1, 80(%esp)
        movaps  %xmm0, 64(%esp)
        movl    60(%esp), %eax
        movl    %eax, 124(%esp)
        movl    56(%esp), %eax
        movl    %eax, 120(%esp)
        movl    52(%esp), %eax
        <many many more 32-bit copies>
        movaps  (%esp), %xmm0
        movaps  16(%esp), %xmm1
        movaps  32(%esp), %xmm2
        movaps  48(%esp), %xmm3
        addl    $140, %esp
        ret

On Nehalem, it may even be cheaper to just use movups when unaligned than to
fall back to lower-granularity chunks.

//===---------------------------------------------------------------------===//

Implement processor-specific optimizations for parity with GCC on these
processors.  GCC does two optimizations:

1. ix86_pad_returns inserts a noop before ret instructions that are immediately
   preceded by a conditional branch or are the target of a jump.
2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   code contains more than 3 branches.

The first one is done for all AMDs, Core2, and "Generic".
The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic".

//===---------------------------------------------------------------------===//

Testcase:
int x(int a) { return (a&0xf0)>>4; }

Current output:
        movl    4(%esp), %eax
        shrl    $4, %eax
        andl    $15, %eax
        ret

Ideal output:
        movzbl  4(%esp), %eax
        shrl    $4, %eax
        ret

//===---------------------------------------------------------------------===//

Re-implement the atomic builtins __sync_add_and_fetch() and
__sync_sub_and_fetch() properly.

When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to use add to implement these.  Instead, it can use
add, sub, inc, and dec instructions with the "lock" prefix.

This is currently implemented using an instruction selection trick.  The issue
is that the target-independent pattern produces one output and a chain, and we
want to map it into one that just outputs a chain.  The current trick is to
select it into a MERGE_VALUES with the first definition being an implicit_def.
The proper solution is to add new ISD opcodes for the no-output variant.  The
DAG combiner can then transform the node before it gets to target node
selection.

Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions.  We need a
way to add target-specific information to target nodes and have this
information carried over to machine instructions.  The asm printer (or JIT) can
use this information to add the "lock" prefix.

//===---------------------------------------------------------------------===//

struct B {
  unsigned char y0 : 1;
};

int bar(struct B* a) { return a->y0; }

define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
  %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
  %2 = load i8* %1, align 1
  %3 = and i8 %2, 1
  %4 = zext i8 %3 to i32
  ret i32 %4
}

bar:                                    # @bar
# BB#0:
        movb    (%rdi), %al
        andb    $1, %al
        movzbl  %al, %eax
        ret

Missed optimization: this should be movl+andl.

//===---------------------------------------------------------------------===//

The x86_64 ABI says:

Booleans, when stored in a memory object, are stored as single byte objects the
value of which is always 0 (false) or 1 (true).

We are not using this fact:

int bar(_Bool *a) { return *a; }

define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
  %1 = load i8* %a, align 1, !tbaa !0
  %tmp = and i8 %1, 1
  %2 = zext i8 %tmp to i32
  ret i32 %2
}

bar:
        movb    (%rdi), %al
        andb    $1, %al
        movzbl  %al, %eax
        ret

GCC produces:

bar:
        movzbl  (%rdi), %eax
        ret

//===---------------------------------------------------------------------===//

Consider the following two functions compiled with clang:
_Bool foo(int *x) { return !(*x & 4); }
unsigned bar(int *x) { return !(*x & 4); }

foo:
        movl    4(%esp), %eax
        testb   $4, (%eax)
        sete    %al
        movzbl  %al, %eax
        ret

bar:
        movl    4(%esp), %eax
        movl    (%eax), %eax
        shrl    $2, %eax
        andl    $1, %eax
        xorl    $1, %eax
        ret

The second function generates more code even though the two functions are
functionally identical.

//===---------------------------------------------------------------------===//

Take the following C code:
int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }

We generate the following IR with clang:
define i32 @f(i32 %a, i32 %b) nounwind readnone {
entry:
  %tmp = xor i32 %b, %a                 ; <i32> [#uses=1]
  %tmp6 = and i32 %tmp, 255             ; <i32> [#uses=1]
  %cmp = icmp eq i32 %tmp6, 0           ; <i1> [#uses=1]
  %conv5 = zext i1 %cmp to i32          ; <i32> [#uses=1]
  ret i32 %conv5
}

And the following x86 code:
        xorl    %esi, %edi
        testb   $-1, %dil
        sete    %al
        movzbl  %al, %eax
        ret

A cmpb instead of the xorl+testb would be one instruction shorter.

//===---------------------------------------------------------------------===//

Given the following C code:
int f(int a, int b) { return (signed char)a == (signed char)b; }

We generate the following IR with clang:
define i32 @f(i32 %a, i32 %b) nounwind readnone {
entry:
  %sext = shl i32 %a, 24                ; <i32> [#uses=1]
  %conv1 = ashr i32 %sext, 24           ; <i32> [#uses=1]
  %sext6 = shl i32 %b, 24               ; <i32> [#uses=1]
  %conv4 = ashr i32 %sext6, 24          ; <i32> [#uses=1]
  %cmp = icmp eq i32 %conv1, %conv4     ; <i1> [#uses=1]
  %conv5 = zext i1 %cmp to i32          ; <i32> [#uses=1]
  ret i32 %conv5
}

And the following x86 code:
        movsbl  %sil, %eax
        movsbl  %dil, %ecx
        cmpl    %eax, %ecx
        sete    %al
        movzbl  %al, %eax
        ret

It should be possible to eliminate the sign extensions.

//===---------------------------------------------------------------------===//

LLVM misses a load+store narrowing opportunity in this code:

%struct.bf = type { i64, i16, i16, i32 }

@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]

define void @t1() nounwind ssp {
entry:
  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
  store i32 %4, i32* %2, align 1
  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
  store i32 %9, i32* %7, align 1
  ret void
}

LLVM currently emits this:

        movq    bfi(%rip), %rax
        andl    $-65537, 8(%rax)
        movq    bfi(%rip), %rax
        andl    $-131073, 8(%rax)
        ret

It could narrow the loads and stores to emit this:

        movq    bfi(%rip), %rax
        andb    $-2, 10(%rax)
        movq    bfi(%rip), %rax
        andb    $-3, 10(%rax)
        ret

The trouble is that there is a TokenFactor between the store and the
load, making it non-trivial to determine if there's anything between
the load and the store which would prohibit narrowing.

//===---------------------------------------------------------------------===//

This code:
void foo(unsigned x) {
  if (x == 0) bar();
  else if (x == 1) qux();
}

currently compiles into:
_foo:
        movl    4(%esp), %eax
        cmpl    $1, %eax
        je      LBB0_3
        testl   %eax, %eax
        jne     LBB0_4

the testl could be removed:
_foo:
        movl    4(%esp), %eax
        cmpl    $1, %eax
        je      LBB0_3
        jb      LBB0_4

0 is the only unsigned number < 1.

//===---------------------------------------------------------------------===//

This code:

%0 = type { i32, i1 }

define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
entry:
  %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
  %cmp = extractvalue %0 %uadd, 1
  %inc = zext i1 %cmp to i32
  %add = add i32 %x, %sum
  %z.0 = add i32 %add, %inc
  ret i32 %z.0
}

declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone

compiles to:

_add32carry:                            ## @add32carry
        addl    %esi, %edi
        sbbl    %ecx, %ecx
        movl    %edi, %eax
        subl    %ecx, %eax
        ret

But it could be:

_add32carry:
        leal    (%rsi,%rdi), %eax
        cmpl    %esi, %eax
        adcl    $0, %eax
        ret

//===---------------------------------------------------------------------===//

The hot loop of 256.bzip2 contains code that looks a bit like this:

int foo(char *P, char *Q, int x, int y) {
  if (P[0] != Q[0])
     return P[0] < Q[0];
  if (P[1] != Q[1])
     return P[1] < Q[1];
  if (P[2] != Q[2])
     return P[2] < Q[2];
  return P[3] < Q[3];
}

In the real code, we get a lot more wrong than this.
However, even in this code we generate:

_foo:                                   ## @foo
## BB#0:                                ## %entry
        movb    (%rsi), %al
        movb    (%rdi), %cl
        cmpb    %al, %cl
        je      LBB0_2
LBB0_1:                                 ## %if.then
        cmpb    %al, %cl
        jmp     LBB0_5
LBB0_2:                                 ## %if.end
        movb    1(%rsi), %al
        movb    1(%rdi), %cl
        cmpb    %al, %cl
        jne     LBB0_1
## BB#3:                                ## %if.end38
        movb    2(%rsi), %al
        movb    2(%rdi), %cl
        cmpb    %al, %cl
        jne     LBB0_1
## BB#4:                                ## %if.end60
        movb    3(%rdi), %al
        cmpb    3(%rsi), %al
LBB0_5:                                 ## %if.end60
        setl    %al
        movzbl  %al, %eax
        ret

Note that we generate jumps to LBB0_1 which does a redundant compare.  The
redundant compare also forces the register values to be live, which prevents
folding one of the loads into the compare.  In contrast, GCC 4.2 produces:

_foo:
        movzbl  (%rsi), %eax
        cmpb    %al, (%rdi)
        jne     L10
L12:
        movzbl  1(%rsi), %eax
        cmpb    %al, 1(%rdi)
        jne     L10
        movzbl  2(%rsi), %eax
        cmpb    %al, 2(%rdi)
        jne     L10
        movzbl  3(%rdi), %eax
        cmpb    3(%rsi), %al
L10:
        setl    %al
        movzbl  %al, %eax
        ret

which is "perfect".

//===---------------------------------------------------------------------===//

For the branch in the following code:
int a();
int b(int x, int y) {
  if (x & (1<<(y&7)))
    return a();
  return y;
}

We currently generate:
        movb    %sil, %al
        andb    $7, %al
        movzbl  %al, %eax
        btl     %eax, %edi
        jae     .LBB0_2

movl+andl would be shorter than the movb+andb+movzbl sequence.

//===---------------------------------------------------------------------===//

For the following:
struct u1 {
    float x, y;
};
float foo(struct u1 u) {
    return u.x + u.y;
}

We currently generate:
        movdqa  %xmm0, %xmm1
        pshufd  $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
        addss   %xmm1, %xmm0
        ret

We could save an instruction here by commuting the addss.

//===---------------------------------------------------------------------===//

This (from PR9661):

float clamp_float(float a) {
        if (a > 1.0f)
                return 1.0f;
        else if (a < 0.0f)
                return 0.0f;
        else
                return a;
}

Could compile to:

clamp_float:                            # @clamp_float
        movss   .LCPI0_0(%rip), %xmm1
        minss   %xmm1, %xmm0
        pxor    %xmm1, %xmm1
        maxss   %xmm1, %xmm0
        ret

with -ffast-math.

//===---------------------------------------------------------------------===//

This function (from PR9803):

int clamp2(int a) {
        if (a > 5)
                a = 5;
        if (a < 0)
                return 0;
        return a;
}

Compiles to:

_clamp2:                                ## @clamp2
        pushq   %rbp
        movq    %rsp, %rbp
        cmpl    $5, %edi
        movl    $5, %ecx
        cmovlel %edi, %ecx
        testl   %ecx, %ecx
        movl    $0, %eax
        cmovnsl %ecx, %eax
        popq    %rbp
        ret

The move of 0 could be scheduled above the test so that it can be materialized
with xor reg,reg.

//===---------------------------------------------------------------------===//

GCC PR48986.  We currently compile this:

void bar(void);
void yyy(int* p) {
    if (__sync_fetch_and_add(p, -1) == 1)
      bar();
}

into:
        movl    $-1, %eax
        lock
        xaddl   %eax, (%rdi)
        cmpl    $1, %eax
        je      LBB0_2

Instead we could generate:

        lock
        decl    (%rdi)
        je      LBB0_2

The trick is to match "fetch_and_add(X, -C) == C".

//===---------------------------------------------------------------------===//

unsigned t(unsigned a, unsigned b) {
  return a <= b ? 5 : -5;
}

We generate:
        movl    $5, %ecx
        cmpl    %esi, %edi
        movl    $-5, %eax
        cmovbel %ecx, %eax

GCC:
        cmpl    %edi, %esi
        sbbl    %eax, %eax
        andl    $-10, %eax
        addl    $5, %eax

//===---------------------------------------------------------------------===//