      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend.
      3 //===---------------------------------------------------------------------===//
      4 
      5 Improvements to the multiply -> shift/add algorithm:
      6 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
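
For instance (an illustrative case, not taken from the patch above; the function
name is made up), a multiply by a small constant can be decomposed into shifts
and adds:

int mul9(int x) {
  /* Candidate for (x << 3) + x, i.e. a single lea, instead of an imull. */
  return x * 9;
}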
      7 
      8 //===---------------------------------------------------------------------===//
      9 
     10 Improve code like this (occurs fairly frequently, e.g. in LLVM):
     11 long long foo(int x) { return 1LL << x; }
     12 
     13 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
     14 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
     15 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
     16 
     17 Another useful one would be  ~0ULL >> X and ~0ULL << X.
     18 
     19 One better solution for 1LL << x is:
     20         xorl    %eax, %eax
     21         xorl    %edx, %edx
     22         testb   $32, %cl
     23         sete    %al
     24         setne   %dl
     25         sall    %cl, %eax
     26         sall    %cl, %edx
     27 
     28 But that requires good 8-bit subreg support.
     29 
     30 Also, this might be better.  It's an extra shift, but it's one instruction
     31 shorter, and doesn't stress 8-bit subreg support.
     32 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
     33 but without the unnecessary and.)
     34         movl %ecx, %eax
     35         shrl $5, %eax
     36         movl %eax, %edx
     37         xorl $1, %edx
     38         sall %cl, %eax
     39         sall %cl, %edx
     40 
     41 64-bit shifts (in general) expand to really bad code.  Instead of using
     42 cmovs, we should expand to a conditional branch like GCC produces.
     43 
     44 //===---------------------------------------------------------------------===//
     45 
     46 Some isel ideas:
     47 
     48 1. Dynamic programming based approach when compile time is not an
     49    issue.
     50 2. Code duplication (addressing mode) during isel.
     51 3. Other ideas from "Register-Sensitive Selection, Duplication, and
     52    Sequencing of Instructions".
     53 4. Scheduling for reduced register pressure.  E.g. "Minimum Register 
     54    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 
     55    and other related papers.
     56    http://citeseer.ist.psu.edu/govindarajan01minimum.html
     57 
     58 //===---------------------------------------------------------------------===//
     59 
     60 Should we promote i16 to i32 to avoid partial register update stalls?
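
A small example of the kind of code in question (hypothetical function, for
illustration only):

unsigned short add16(unsigned short a, unsigned short b) {
  /* Done with 16-bit adds, the write to %ax merges with the stale upper half
     of %eax; promoting to i32 and truncating at the end avoids the partial
     register update stall on processors that suffer from it. */
  return a + b;
}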
     61 
     62 //===---------------------------------------------------------------------===//
     63 
     64 Leave any_extend as a pseudo instruction and hint to the register
     65 allocator. Delay codegen until after register allocation.
     66 Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
     67 the coalescer how to deal with it, though.
     68 
     69 //===---------------------------------------------------------------------===//
     70 
     71 It appears icc uses push for parameter passing. This needs investigation.
     72 
     73 //===---------------------------------------------------------------------===//
     74 
     75 The instruction selector sometimes misses folding a load into a compare.  The
     76 pattern is written as (cmp reg, (load p)).  Because the compare isn't 
     77 commutative, it is not matched with the load on both sides.  The dag combiner
     78 should be made smart enough to canonicalize the load into the RHS of a compare
     79 when it can invert the result of the compare for free.
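
A minimal illustration (hypothetical function): as written, the load of *p ends
up on the LHS of the compare, so it is not folded; swapping the operands and
inverting the condition would allow the fold.

int cmp_load(int *p, int x) {
  /* (cmp (load p), x): canonicalizing to (cmp x, (load p)) and inverting the
     condition code would let isel fold the load into the cmpl. */
  return *p < x;
}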
     80 
     81 //===---------------------------------------------------------------------===//
     82 
     83 In many cases, LLVM generates code like this:
     84 
     85 _test:
     86         movl 8(%esp), %eax
     87         cmpl %eax, 4(%esp)
     88         setl %al
     89         movzbl %al, %eax
     90         ret
     91 
     92 On some processors (which ones?), it is more efficient to do this:
     93 
     94 _test:
     95         movl 8(%esp), %ebx
     96         xor  %eax, %eax
     97         cmpl %ebx, 4(%esp)
     98         setl %al
     99         ret
    100 
    101 Doing this correctly is tricky though, as the xor clobbers the flags.
    102 
    103 //===---------------------------------------------------------------------===//
    104 
    105 We should generate bts/btr/etc instructions on targets where they are cheap or
    106 when codesize is important.  e.g., for:
    107 
    108 void setbit(int *target, int bit) {
    109     *target |= (1 << bit);
    110 }
    111 void clearbit(int *target, int bit) {
    112     *target &= ~(1 << bit);
    113 }
    114 
    115 //===---------------------------------------------------------------------===//
    116 
    117 Instead of the following for memset(char*, 1, 10):
    118 
    119 	movl $16843009, 4(%edx)
    120 	movl $16843009, (%edx)
    121 	movw $257, 8(%edx)
    122 
    123 It might be better to generate
    124 
    125 	movl $16843009, %eax
    126 	movl %eax, 4(%edx)
    127 	movl %eax, (%edx)
    128 	movw %ax, 8(%edx)
    129 	
    130 when we can spare a register. It reduces code size.
    131 
    132 //===---------------------------------------------------------------------===//
    133 
    134 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
    135 get this:
    136 
    137 define i32 @test1(i32 %X) {
    138     %Y = sdiv i32 %X, 8
    139     ret i32 %Y
    140 }
    141 
    142 _test1:
    143         movl 4(%esp), %eax
    144         movl %eax, %ecx
    145         sarl $31, %ecx
    146         shrl $29, %ecx
    147         addl %ecx, %eax
    148         sarl $3, %eax
    149         ret
    150 
    151 GCC knows several different ways to codegen it, one of which is this:
    152 
    153 _test1:
    154         movl    4(%esp), %eax
    155         cmpl    $-1, %eax
    156         leal    7(%eax), %ecx
    157         cmovle  %ecx, %eax
    158         sarl    $3, %eax
    159         ret
    160 
    161 which is probably slower, but it's interesting at least :)
    162 
    163 //===---------------------------------------------------------------------===//
    164 
    165 We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
    166 We should leave these as libcalls for everything over a much lower threshold,
    167 since libc is hand-tuned for medium and large mem ops (avoiding RFO for large
    168 stores, TLB preheating, etc.).
    169 
    170 //===---------------------------------------------------------------------===//
    171 
    172 Optimize this into something reasonable:
    173  x * copysign(1.0, y) * copysign(1.0, z)
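
One "reasonable" lowering (a sketch, assuming IEEE-754 doubles and ignoring NaN
sign subtleties; the function name is made up): the two multiplies only ever
flip the sign of x, so the result is x with the sign bits of y and z xor'ed in.

#include <stdint.h>
#include <string.h>

double signprod(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  memcpy(&zb, &z, sizeof zb);
  xb ^= (yb ^ zb) & 0x8000000000000000ULL;  /* combine the two sign flips */
  memcpy(&x, &xb, sizeof x);
  return x;
}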
    174 
    175 //===---------------------------------------------------------------------===//
    176 
    177 Optimize copysign(x, *y) to use an integer load from y.
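
Only the sign bit of *y is needed, so an integer load of its high word suffices
and the double never has to go through an FP/SSE register. A minimal test case
(hypothetical function name):

#include <math.h>

double copysign_mem(double x, double *y) {
  return copysign(x, *y);   /* the load of *y only feeds the sign bit */
}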
    178 
    179 //===---------------------------------------------------------------------===//
    180 
    181 The following tests perform worse with LSR:
    182 
    183 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
    184 
    185 //===---------------------------------------------------------------------===//
    186 
    187 Adding to the list of cmp / test poor codegen issues:
    188 
    189 int test(__m128 *A, __m128 *B) {
    190   if (_mm_comige_ss(*A, *B))
    191     return 3;
    192   else
    193     return 4;
    194 }
    195 
    196 _test:
    197 	movl 8(%esp), %eax
    198 	movaps (%eax), %xmm0
    199 	movl 4(%esp), %eax
    200 	movaps (%eax), %xmm1
    201 	comiss %xmm0, %xmm1
    202 	setae %al
    203 	movzbl %al, %ecx
    204 	movl $3, %eax
    205 	movl $4, %edx
    206 	cmpl $0, %ecx
    207 	cmove %edx, %eax
    208 	ret
    209 
    210 Note that the setae, movzbl, cmpl, and cmove can be replaced with a single cmovae.
    211 There are a number of issues: 1) We are introducing a setcc between the result of
    212 the intrinsic call and the select. 2) The intrinsic is expected to produce an i32
    213 value, so an any_extend (which becomes a zero extend) is added.
    214 
    215 We probably need some kind of target DAG combine hook to fix this.
    216 
    217 //===---------------------------------------------------------------------===//
    218 
    219 We generate significantly worse code for this than GCC:
    220 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
    221 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
    222 
    223 There is also one case where we do worse on PPC.
    224 
    225 //===---------------------------------------------------------------------===//
    226 
    227 For this:
    228 
    229 int test(int a)
    230 {
    231   return a * 3;
    232 }
    233 
    234 We currently emit:
    235 	imull $3, 4(%esp), %eax
    236 
    237 Perhaps this is what we really should generate? Is imull three or four
    238 cycles? Note: ICC generates this:
    239 	movl	4(%esp), %eax
    240 	leal	(%eax,%eax,2), %eax
    241 
    242 The current instruction priority is based on pattern complexity. The former is
    243 more "complex" because it folds a load so the latter will not be emitted.
    244 
    245 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
    246 should always try to match LEA first since the LEA matching code does some
    247 estimate to determine whether the match is profitable.
    248 
    249 However, if we care more about code size, then imull is better. It's two bytes
    250 shorter than movl + leal.
    251 
    252 On a Pentium M, both variants have the same characteristics with regard
    253 to throughput; however, the multiplication has a latency of four cycles, as
    254 opposed to two cycles for the movl+lea variant.
    255 
    256 //===---------------------------------------------------------------------===//
    257 
    258 It appears gcc places string data with linkonce linkage in
    259 .section __TEXT,__const_coal,coalesced instead of
    260 .section __DATA,__const_coal,coalesced.
    261 Take a look at darwin.h; there are other Darwin assembler directives that we
    262 do not make use of.
    263 
    264 //===---------------------------------------------------------------------===//
    265 
    266 define i32 @foo(i32* %a, i32 %t) {
    267 entry:
    268 	br label %cond_true
    269 
    270 cond_true:		; preds = %cond_true, %entry
    271 	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
    272 	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
    273 	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
    274 	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
    275 	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
    276 	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
    277 	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
    278 	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
    279 	br i1 %tmp, label %bb12, label %cond_true
    280 
    281 bb12:		; preds = %cond_true
    282 	ret i32 %tmp7
    283 }
    284 is pessimized by -loop-reduce and -indvars.
    285 
    286 //===---------------------------------------------------------------------===//
    287 
    288 u32 to float conversion improvement:
    289 
    290 float uint32_2_float( unsigned u ) {
    291   float fl = (int) (u & 0xffff);
    292   float fh = (int) (u >> 16);
    293   fh *= 0x1.0p16f;
    294   return fh + fl;
    295 }
    296 
    297 00000000        subl    $0x04,%esp
    298 00000003        movl    0x08(%esp,1),%eax
    299 00000007        movl    %eax,%ecx
    300 00000009        shrl    $0x10,%ecx
    301 0000000c        cvtsi2ss        %ecx,%xmm0
    302 00000010        andl    $0x0000ffff,%eax
    303 00000015        cvtsi2ss        %eax,%xmm1
    304 00000019        mulss   0x00000078,%xmm0
    305 00000021        addss   %xmm1,%xmm0
    306 00000025        movss   %xmm0,(%esp,1)
    307 0000002a        flds    (%esp,1)
    308 0000002d        addl    $0x04,%esp
    309 00000030        ret
    310 
    311 //===---------------------------------------------------------------------===//
    312 
    313 When using the fastcc ABI, align the stack slot of an argument of type double
    314 on an 8-byte boundary to improve performance.
    315 
    316 //===---------------------------------------------------------------------===//
    317 
    318 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
    319 simplifications for integer "x cmp y ? a : b".
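
One example of the kind of simplification meant here (a sketch of what GCC can
do, not what we currently emit; the function name is made up): an unsigned
compare feeding a select of two constants can become a branch-free cmp/sbb
sequence, similar to the sbb trick in the last entry of this file.

int select_const(unsigned x, unsigned y) {
  /* Can be lowered as: cmpl %esi, %edi; sbbl %eax, %eax; notl %eax
     (x < y sets CF, sbb materializes -CF, not turns that into 0 or -1). */
  return x < y ? 0 : -1;
}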
    320 
    321 //===---------------------------------------------------------------------===//
    322 
    323 Consider the expansion of:
    324 
    325 define i32 @test3(i32 %X) {
    326         %tmp1 = urem i32 %X, 255
    327         ret i32 %tmp1
    328 }
    329 
    330 Currently it compiles to:
    331 
    332 ...
    333         movl $2155905153, %ecx
    334         movl 8(%esp), %esi
    335         movl %esi, %eax
    336         mull %ecx
    337 ...
    338 
    339 This could be "reassociated" into:
    340 
    341         movl $2155905153, %eax
    342         movl 8(%esp), %ecx
    343         mull %ecx
    344 
    345 to avoid the copy.  In fact, the existing two-address stuff would do this
    346 except that mul isn't a commutative 2-addr instruction.  I guess this has
    347 to be done at isel time based on the #uses to mul?
    348 
    349 //===---------------------------------------------------------------------===//
    350 
    351 Make sure the instruction which starts a loop does not cross a cacheline
    352 boundary. This requires knowing the exact length of each machine instruction.
    353 That is somewhat complicated, but doable. Example 256.bzip2:
    354 
    355 In the new trace, the hot loop has an instruction which crosses a cacheline
    356 boundary.  In addition to potential cache misses, this can't help decoding as I
    357 imagine there has to be some kind of complicated decoder reset and realignment
    358 to grab the bytes from the next cacheline.
    359 
    360 532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
    361 942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
    362 937  937 0x3d0a incl     %esi
    363 3    3   0x3d0b cmpb     %bl, %dl
    364 27   27  0x3d0d jnz      0x000062db <main+11707>
    365 
    366 //===---------------------------------------------------------------------===//
    367 
    368 In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
    369 
    370 //===---------------------------------------------------------------------===//
    371 
    372 This could be a single 16-bit load.
    373 
    374 int f(char *p) {
    375     if ((p[0] == 1) & (p[1] == 2)) return 1;
    376     return 0;
    377 }
    378 
    379 //===---------------------------------------------------------------------===//
    380 
    381 We should inline lrintf and probably other libc functions.
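
For example (illustrative only), with SSE available this whole function could
be a single cvtss2si plus the return instead of a call into libc:

#include <math.h>

long round_to_long(float x) {
  return lrintf(x);   /* lrintf honors the current rounding mode, as cvtss2si does */
}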
    382 
    383 //===---------------------------------------------------------------------===//
    384 
    385 This code:
    386 
    387 void test(int X) {
    388   if (X) abort();
    389 }
    390 
    391 is currently compiled to:
    392 
    393 _test:
    394         subl $12, %esp
    395         cmpl $0, 16(%esp)
    396         jne LBB1_1
    397         addl $12, %esp
    398         ret
    399 LBB1_1:
    400         call L_abort$stub
    401 
    402 It would be better to produce:
    403 
    404 _test:
    405         subl $12, %esp
    406         cmpl $0, 16(%esp)
    407         jne L_abort$stub
    408         addl $12, %esp
    409         ret
    410 
    411 This can be applied to any no-return function call that takes no arguments etc.
    412 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
    413 something like this:
    414 
    415 _test:
    416         cmpl $0, 4(%esp)
    417         jne LBB1_1
    418         ret
    419 LBB1_1:
    420         subl $12, %esp
    421         call L_abort$stub
    422 
    423 Both are useful in different situations.  Finally, it could be shrink-wrapped
    424 and tail called, like this:
    425 
    426 _test:
    427         cmpl $0, 4(%esp)
    428         jne LBB1_1
    429         ret
    430 LBB1_1:
    431         pop %eax   # realign stack.
    432         call L_abort$stub
    433 
    434 Though this probably isn't worth it.
    435 
    436 //===---------------------------------------------------------------------===//
    437 
    438 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
    439 a neg instead of a sub instruction.  Consider:
    440 
    441 int test(char X) { return 7-X; }
    442 
    443 we currently produce:
    444 _test:
    445         movl $7, %eax
    446         movsbl 4(%esp), %ecx
    447         subl %ecx, %eax
    448         ret
    449 
    450 We would use one fewer register if codegen'd as:
    451 
    452         movsbl 4(%esp), %eax
    453 	neg %eax
    454         add $7, %eax
    455         ret
    456 
    457 Note that this isn't beneficial if the load can be folded into the sub.  In
    458 this case, we want a sub:
    459 
    460 int test(int X) { return 7-X; }
    461 _test:
    462         movl $7, %eax
    463         subl 4(%esp), %eax
    464         ret
    465 
    466 //===---------------------------------------------------------------------===//
    467 
    468 Leaf functions that require one 4-byte spill slot have a prolog like this:
    469 
    470 _foo:
    471         pushl   %esi
    472         subl    $4, %esp
    473 ...
    474 and an epilog like this:
    475         addl    $4, %esp
    476         popl    %esi
    477         ret
    478 
    479 It would be smaller, and potentially faster, to push eax on entry and to
    480 pop into a dummy register instead of using addl/subl of esp.  Just don't pop 
    481 into any return registers :)
    482 
    483 //===---------------------------------------------------------------------===//
    484 
    485 The X86 backend should fold (branch (or (setcc, setcc))) into multiple 
    486 branches.  We generate really poor code for:
    487 
    488 double testf(double a) {
    489        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
    490 }
    491 
    492 For example, the entry BB is:
    493 
    494 _testf:
    495         subl    $20, %esp
    496         pxor    %xmm0, %xmm0
    497         movsd   24(%esp), %xmm1
    498         ucomisd %xmm0, %xmm1
    499         setnp   %al
    500         sete    %cl
    501         testb   %cl, %al
    502         jne     LBB1_5  # UnifiedReturnBlock
    503 LBB1_1: # cond_true
    504 
    505 
    506 It would be better to replace the last four instructions with:
    507 
    508 	jp LBB1_1
    509 	je LBB1_5
    510 LBB1_1:
    511 
    512 We also codegen the inner ?: into a diamond:
    513 
    514        cvtss2sd        LCPI1_0(%rip), %xmm2
    515         cvtss2sd        LCPI1_1(%rip), %xmm3
    516         ucomisd %xmm1, %xmm0
    517         ja      LBB1_3  # cond_true
    518 LBB1_2: # cond_true
    519         movapd  %xmm3, %xmm2
    520 LBB1_3: # cond_true
    521         movapd  %xmm2, %xmm0
    522         ret
    523 
    524 We should sink the load into xmm3 into the LBB1_2 block.  This should
    525 be pretty easy, and will nuke all the copies.
    526 
    527 //===---------------------------------------------------------------------===//
    528 
    529 This:
    530         #include <algorithm>
    531         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
    532         { return std::make_pair(a + b, a + b < a); }
    533         bool no_overflow(unsigned a, unsigned b)
    534         { return !full_add(a, b).second; }
    535 
    536 Should compile to:
    537 	addl	%esi, %edi
    538 	setae	%al
    539 	movzbl	%al, %eax
    540 	ret
    541 
    542 on x86-64, instead of the rather stupid-looking:
    543 	addl	%esi, %edi
    544 	setb	%al
    545 	xorb	$1, %al
    546 	movzbl	%al, %eax
    547 	ret
    548 
    549 
    550 //===---------------------------------------------------------------------===//
    551 
    552 The following code:
    553 
    554 bb114.preheader:		; preds = %cond_next94
    555 	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
    556 	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
    557 	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
    558 	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
    559 	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
    560 	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
    561 	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
    562 	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
    563 	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
    564 	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
    565 	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
    566 	br label %bb114
    567 
    568 produces:
    569 
    570 LBB3_5:	# bb114.preheader
    571 	movswl	-68(%ebp), %eax
    572 	movl	$32, %ecx
    573 	movl	%ecx, -80(%ebp)
    574 	subl	%eax, -80(%ebp)
    575 	movswl	-52(%ebp), %eax
    576 	movl	%ecx, -84(%ebp)
    577 	subl	%eax, -84(%ebp)
    578 	movswl	-70(%ebp), %eax
    579 	movl	%ecx, -88(%ebp)
    580 	subl	%eax, -88(%ebp)
    581 	movswl	-50(%ebp), %eax
    582 	subl	%eax, %ecx
    583 	movl	%ecx, -76(%ebp)
    584 	movswl	-42(%ebp), %eax
    585 	movl	%eax, -92(%ebp)
    586 	movswl	-66(%ebp), %eax
    587 	movl	%eax, -96(%ebp)
    588 	movw	$0, -98(%ebp)
    589 
    590 This appears to be bad because the RA is not folding the store to the stack 
    591 slot into the movl.  The above instructions could be:
    592 	movl    $32, -80(%ebp)
    593 ...
    594 	movl    $32, -84(%ebp)
    595 ...
    596 This seems like a cross between remat and spill folding.
    597 
    598 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
    599 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
    600 vice-versa).
    601 
    602 //===---------------------------------------------------------------------===//
    603 
    604 This code:
    605 
    606 	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
    607 	br i1 %tmp659, label %cond_true662, label %cond_next715
    608 
    609 produces this:
    610 
    611 	testw	%cx, %cx
    612 	movswl	%cx, %esi
    613 	jns	LBB4_109	# cond_next715
    614 
    615 Shark tells us that using %cx in the testw instruction is sub-optimal. It
    616 suggests using the 32-bit register (which is what ICC uses).
    617 
    618 //===---------------------------------------------------------------------===//
    619 
    620 We compile this:
    621 
    622 void compare (long long foo) {
    623   if (foo < 4294967297LL)
    624     abort();
    625 }
    626 
    627 to:
    628 
    629 compare:
    630         subl    $4, %esp
    631         cmpl    $0, 8(%esp)
    632         setne   %al
    633         movzbw  %al, %ax
    634         cmpl    $1, 12(%esp)
    635         setg    %cl
    636         movzbw  %cl, %cx
    637         cmove   %ax, %cx
    638         testb   $1, %cl
    639         jne     .LBB1_2 # UnifiedReturnBlock
    640 .LBB1_1:        # ifthen
    641         call    abort
    642 .LBB1_2:        # UnifiedReturnBlock
    643         addl    $4, %esp
    644         ret
    645 
    646 (also really horrible code on ppc).  This is due to the expand code for 64-bit
    647 compares.  GCC produces multiple branches, which is much nicer:
    648 
    649 compare:
    650         subl    $12, %esp
    651         movl    20(%esp), %edx
    652         movl    16(%esp), %eax
    653         decl    %edx
    654         jle     .L7
    655 .L5:
    656         addl    $12, %esp
    657         ret
    658         .p2align 4,,7
    659 .L7:
    660         jl      .L4
    661         cmpl    $0, %eax
    662         .p2align 4,,8
    663         ja      .L5
    664 .L4:
    665         .p2align 4,,9
    666         call    abort
    667 
    668 //===---------------------------------------------------------------------===//
    669 
    670 Tail call optimization improvements: Tail call optimization currently
    671 pushes all arguments on the top of the stack (their normal place for
    672 non-tail-call-optimized calls) that source from the caller's arguments
    673 or that source from a virtual register (also possibly sourcing from the
    674 caller's arguments).
    675 This is done to prevent overwriting of parameters (see the example
    676 below) that might be used later.
    677 
    678 example:  
    679 
    680 int callee(int32, int64); 
    681 int caller(int32 arg1, int32 arg2) { 
    682   int64 local = arg2 * 2; 
    683   return callee(arg2, (int64)local); 
    684 }
    685 
    686 [arg1]          [!arg2 no longer valid since we moved local onto it]
    687 [arg2]      ->  [(int64)
    688 [RETADDR]        local  ]
    689 
    690 Moving arg1 onto the stack slot of the callee function would overwrite
    691 arg2 of the caller.
    692 
    693 Possible optimizations:
    694 
    695 
    696  - Analyse the actual parameters of the callee to see which would
    697    overwrite a caller parameter which is used by the callee and only
    698    push them onto the top of the stack.
    699 
    700    int callee (int32 arg1, int32 arg2);
    701    int caller (int32 arg1, int32 arg2) {
    702        return callee(arg1,arg2);
    703    }
    704 
    705    Here we don't need to write any variables to the top of the stack
    706    since they don't overwrite each other.
    707 
    708    int callee (int32 arg1, int32 arg2);
    709    int caller (int32 arg1, int32 arg2) {
    710        return callee(arg2,arg1);
    711    }
    712 
    713    Here we need to push the arguments because they overwrite each
    714    other.
    715 
    716 //===---------------------------------------------------------------------===//
    717 
    718 main ()
    719 {
    720   int i = 0;
    721   unsigned long int z = 0;
    722 
    723   do {
    724     z -= 0x00004000;
    725     i++;
    726     if (i > 0x00040000)
    727       abort ();
    728   } while (z > 0);
    729   exit (0);
    730 }
    731 
    732 gcc compiles this to:
    733 
    734 _main:
    735 	subl	$28, %esp
    736 	xorl	%eax, %eax
    737 	jmp	L2
    738 L3:
    739 	cmpl	$262144, %eax
    740 	je	L10
    741 L2:
    742 	addl	$1, %eax
    743 	cmpl	$262145, %eax
    744 	jne	L3
    745 	call	L_abort$stub
    746 L10:
    747 	movl	$0, (%esp)
    748 	call	L_exit$stub
    749 
    750 llvm:
    751 
    752 _main:
    753 	subl	$12, %esp
    754 	movl	$1, %eax
    755 	movl	$16384, %ecx
    756 LBB1_1:	# bb
    757 	cmpl	$262145, %eax
    758 	jge	LBB1_4	# cond_true
    759 LBB1_2:	# cond_next
    760 	incl	%eax
    761 	addl	$4294950912, %ecx
    762 	cmpl	$16384, %ecx
    763 	jne	LBB1_1	# bb
    764 LBB1_3:	# bb11
    765 	xorl	%eax, %eax
    766 	addl	$12, %esp
    767 	ret
    768 LBB1_4:	# cond_true
    769 	call	L_abort$stub
    770 
    771 1. LSR should rewrite the first cmp with induction variable %ecx.
    772 2. DAG combiner should fold
    773         leal    1(%eax), %edx
    774         cmpl    $262145, %edx
    775    =>
    776         cmpl    $262144, %eax
    777 
    778 //===---------------------------------------------------------------------===//
    779 
    780 define i64 @test(double %X) {
    781 	%Y = fptosi double %X to i64
    782 	ret i64 %Y
    783 }
    784 
    785 compiles to:
    786 
    787 _test:
    788 	subl	$20, %esp
    789 	movsd	24(%esp), %xmm0
    790 	movsd	%xmm0, 8(%esp)
    791 	fldl	8(%esp)
    792 	fisttpll	(%esp)
    793 	movl	4(%esp), %edx
    794 	movl	(%esp), %eax
    795 	addl	$20, %esp
    796 	#FP_REG_KILL
    797 	ret
    798 
    799 This should just fldl directly from the input stack slot.
    800 
    801 //===---------------------------------------------------------------------===//
    802 
    803 This code:
    804 int foo (int x) { return (x & 65535) | 255; }
    805 
    806 Should compile into:
    807 
    808 _foo:
    809         movzwl  4(%esp), %eax
    810         orl     $255, %eax
    811         ret
    812 
    813 instead of:
    814 _foo:
    815 	movl	$65280, %eax
    816 	andl	4(%esp), %eax
    817 	orl	$255, %eax
    818 	ret
    819 
    820 //===---------------------------------------------------------------------===//
    821 
    822 We're codegen'ing multiply of long longs inefficiently:
    823 
    824 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
    825   return arg1 *  arg2;
    826 }
    827 
    828 We compile to (with -fomit-frame-pointer):
    829 
    830 _LLM:
    831 	pushl	%esi
    832 	movl	8(%esp), %ecx
    833 	movl	16(%esp), %esi
    834 	movl	%esi, %eax
    835 	mull	%ecx
    836 	imull	12(%esp), %esi
    837 	addl	%edx, %esi
    838 	imull	20(%esp), %ecx
    839 	movl	%esi, %edx
    840 	addl	%ecx, %edx
    841 	popl	%esi
    842 	ret
    843 
    844 This looks like a scheduling deficiency and lack of remat of the load from
    845 the argument area.  ICC apparently produces:
    846 
    847         movl      8(%esp), %ecx
    848         imull     12(%esp), %ecx
    849         movl      16(%esp), %eax
    850         imull     4(%esp), %eax 
    851         addl      %eax, %ecx  
    852         movl      4(%esp), %eax
    853         mull      12(%esp) 
    854         addl      %ecx, %edx
    855         ret
    856 
    857 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
    858 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
    859 
    860 //===---------------------------------------------------------------------===//
    861 
    862 We can fold a store into "zeroing a reg".  Instead of:
    863 
    864 xorl    %eax, %eax
    865 movl    %eax, 124(%esp)
    866 
    867 we should get:
    868 
    869 movl    $0, 124(%esp)
    870 
    871 if the flags of the xor are dead.
    872 
    873 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
    874 be folded into: shl [mem], 1
    875 
    876 //===---------------------------------------------------------------------===//
    877 
    878 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
    879 or and instruction, for example:
    880 
    881 	xorpd	LCPI1_0, %xmm2
    882 
    883 However, if xmm2 gets spilled, we end up with really ugly code like this:
    884 
    885 	movsd	(%esp), %xmm0
    886 	xorpd	LCPI1_0, %xmm0
    887 	movsd	%xmm0, (%esp)
    888 
    889 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
    890 the neg/abs instruction, turning it into an *integer* operation, like this:
    891 
    892 	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
    893 
    894 you could also use xorb, but xorl is less likely to lead to a partial register
    895 stall.  Here is a contrived testcase:
    896 
    897 double a, b, c;
    898 void test(double *P) {
    899   double X = *P;
    900   a = X;
    901   bar();
    902   X = -X;
    903   b = X;
    904   bar();
    905   c = X;
    906 }
    907 
    908 //===---------------------------------------------------------------------===//
    909 
    910 The code generated on x86 for checking for signed overflow of a multiply, done
    911 the obvious way, is much longer than it needs to be.
    912 
    913 int x(int a, int b) {
    914   long long prod = (long long)a*b;
    915   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
    916 }
    917 
    918 See PR2053 for more details.
    919 
    920 //===---------------------------------------------------------------------===//
    921 
    922 We should investigate using cdq/cltd (effect: edx = sar eax, 31)
    923 more aggressively; it should cost the same as a move+shift on any modern
    924 processor, but it's a lot shorter. Downside is that it puts more
    925 pressure on register allocation because it has fixed operands.
    926 
    927 Example:
    928 int abs(int x) {return x < 0 ? -x : x;}
    929 
    930 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
    931 abs:
    932         movl    4(%esp), %eax
    933         cltd
    934         xorl    %edx, %eax
    935         subl    %edx, %eax
    936         ret
    937 
    938 //===---------------------------------------------------------------------===//
    939 
    940 Take the following code (from 
    941 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
    942 
    943 extern unsigned char first_one[65536];
    944 int FirstOnet(unsigned long long arg1)
    945 {
    946   if (arg1 >> 48)
    947     return (first_one[arg1 >> 48]);
    948   return 0;
    949 }
    950 
    951 
    952 The following code is currently generated:
    953 FirstOnet:
    954         movl    8(%esp), %eax
    955         cmpl    $65536, %eax
    956         movl    4(%esp), %ecx
    957         jb      .LBB1_2 # UnifiedReturnBlock
    958 .LBB1_1:        # ifthen
    959         shrl    $16, %eax
    960         movzbl  first_one(%eax), %eax
    961         ret
    962 .LBB1_2:        # UnifiedReturnBlock
    963         xorl    %eax, %eax
    964         ret
    965 
    966 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
    967 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
    968 
    969 //===---------------------------------------------------------------------===//
    970 
    971 We compile this function:
    972 
    973 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
    974 entry:
    975 	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
    976 	br i1 %tmp2, label %bb7, label %bb
    977 
    978 bb:		; preds = %entry
    979 	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
    980 	ret i32 %tmp6
    981 
    982 bb7:		; preds = %entry
    983 	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
    984 	ret i32 %tmp10
    985 }
    986 
    987 to:
    988 
    989 foo:                                    # @foo
    990 # BB#0:                                 # %entry
    991 	movl	4(%esp), %ecx
    992 	cmpb	$0, 16(%esp)
    993 	je	.LBB0_2
    994 # BB#1:                                 # %bb
    995 	movl	8(%esp), %eax
    996 	addl	%ecx, %eax
    997 	ret
    998 .LBB0_2:                                # %bb7
    999 	movl	12(%esp), %edx
   1000 	movl	%ecx, %eax
   1001 	subl	%edx, %eax
   1002 	ret
   1003 
   1004 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
   1005 couple more movls by putting 4(%esp) into %eax instead of %ecx.
   1006 
   1007 //===---------------------------------------------------------------------===//
   1008 
   1009 See rdar://4653682.
   1010 
   1011 From flops:
   1012 
   1013 LBB1_15:        # bb310
   1014         cvtss2sd        LCPI1_0, %xmm1
   1015         addsd   %xmm1, %xmm0
   1016         movsd   176(%esp), %xmm2
   1017         mulsd   %xmm0, %xmm2
   1018         movapd  %xmm2, %xmm3
   1019         mulsd   %xmm3, %xmm3
   1020         movapd  %xmm3, %xmm4
   1021         mulsd   LCPI1_23, %xmm4
   1022         addsd   LCPI1_24, %xmm4
   1023         mulsd   %xmm3, %xmm4
   1024         addsd   LCPI1_25, %xmm4
   1025         mulsd   %xmm3, %xmm4
   1026         addsd   LCPI1_26, %xmm4
   1027         mulsd   %xmm3, %xmm4
   1028         addsd   LCPI1_27, %xmm4
   1029         mulsd   %xmm3, %xmm4
   1030         addsd   LCPI1_28, %xmm4
   1031         mulsd   %xmm3, %xmm4
   1032         addsd   %xmm1, %xmm4
   1033         mulsd   %xmm2, %xmm4
   1034         movsd   152(%esp), %xmm1
   1035         addsd   %xmm4, %xmm1
   1036         movsd   %xmm1, 152(%esp)
   1037         incl    %eax
   1038         cmpl    %eax, %esi
   1039         jge     LBB1_15 # bb310
   1040 LBB1_16:        # bb358.loopexit
   1041         movsd   152(%esp), %xmm0
   1042         addsd   %xmm0, %xmm0
   1043         addsd   LCPI1_22, %xmm0
   1044         movsd   %xmm0, 152(%esp)
   1045 
   1046 Rather than spilling the result of the last addsd in the loop, we should
   1047 insert a copy to split the interval (one for the duration of the loop, one
   1048 extending to the fall through). The register pressure in the loop isn't high
   1049 enough to warrant the spill.
   1050 
   1051 Also check why xmm7 is not used at all in the function.
   1052 
   1053 //===---------------------------------------------------------------------===//
   1054 
   1055 Take the following:
   1056 
   1057 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
   1058 target triple = "i386-apple-darwin8"
   1059 @in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
   1060 define fastcc void @abort_gzip() noreturn nounwind  {
   1061 entry:
   1062 	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
   1063 	br i1 %tmp.b.i, label %bb.i, label %bb4.i
   1064 bb.i:		; preds = %entry
   1065 	tail call void @exit( i32 1 ) noreturn nounwind 
   1066 	unreachable
   1067 bb4.i:		; preds = %entry
   1068 	store i1 true, i1* @in_exit.4870.b
   1069 	tail call void @exit( i32 1 ) noreturn nounwind 
   1070 	unreachable
   1071 }
   1072 declare void @exit(i32) noreturn nounwind 
   1073 
   1074 This compiles into:
   1075 _abort_gzip:                            ## @abort_gzip
   1076 ## BB#0:                                ## %entry
   1077 	subl	$12, %esp
   1078 	movb	_in_exit.4870.b, %al
   1079 	cmpb	$1, %al
   1080 	jne	LBB0_2
   1081 
   1082 We somehow miss folding the movb into the cmpb.
   1083 
   1084 //===---------------------------------------------------------------------===//
   1085 
   1086 We compile:
   1087 
   1088 int test(int x, int y) {
   1089   return x-y-1;
   1090 }
   1091 
   1092 into (-m64):
   1093 
   1094 _test:
   1095 	decl	%edi
   1096 	movl	%edi, %eax
   1097 	subl	%esi, %eax
   1098 	ret
   1099 
   1100 It would be better to codegen as: x+~y  (notl+addl)
   1101 
   1102 //===---------------------------------------------------------------------===//
   1103 
   1104 This code:
   1105 
   1106 int foo(const char *str,...)
   1107 {
   1108  __builtin_va_list a; int x;
   1109  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
   1110  return x;
   1111 }
   1112 
   1113 gets compiled into this on x86-64:
   1114 	subq    $200, %rsp
   1115         movaps  %xmm7, 160(%rsp)
   1116         movaps  %xmm6, 144(%rsp)
   1117         movaps  %xmm5, 128(%rsp)
   1118         movaps  %xmm4, 112(%rsp)
   1119         movaps  %xmm3, 96(%rsp)
   1120         movaps  %xmm2, 80(%rsp)
   1121         movaps  %xmm1, 64(%rsp)
   1122         movaps  %xmm0, 48(%rsp)
   1123         movq    %r9, 40(%rsp)
   1124         movq    %r8, 32(%rsp)
   1125         movq    %rcx, 24(%rsp)
   1126         movq    %rdx, 16(%rsp)
   1127         movq    %rsi, 8(%rsp)
   1128         leaq    (%rsp), %rax
   1129         movq    %rax, 192(%rsp)
   1130         leaq    208(%rsp), %rax
   1131         movq    %rax, 184(%rsp)
   1132         movl    $48, 180(%rsp)
   1133         movl    $8, 176(%rsp)
   1134         movl    176(%rsp), %eax
   1135         cmpl    $47, %eax
   1136         jbe     .LBB1_3 # bb
   1137 .LBB1_1:        # bb3
   1138         movq    184(%rsp), %rcx
   1139         leaq    8(%rcx), %rax
   1140         movq    %rax, 184(%rsp)
   1141 .LBB1_2:        # bb4
   1142         movl    (%rcx), %eax
   1143         addq    $200, %rsp
   1144         ret
   1145 .LBB1_3:        # bb
   1146         movl    %eax, %ecx
   1147         addl    $8, %eax
   1148         addq    192(%rsp), %rcx
   1149         movl    %eax, 176(%rsp)
   1150         jmp     .LBB1_2 # bb4
   1151 
   1152 gcc 4.3 generates:
   1153 	subq    $96, %rsp
   1154 .LCFI0:
   1155         leaq    104(%rsp), %rax
   1156         movq    %rsi, -80(%rsp)
   1157         movl    $8, -120(%rsp)
   1158         movq    %rax, -112(%rsp)
   1159         leaq    -88(%rsp), %rax
   1160         movq    %rax, -104(%rsp)
   1161         movl    $8, %eax
   1162         cmpl    $48, %eax
   1163         jb      .L6
   1164         movq    -112(%rsp), %rdx
   1165         movl    (%rdx), %eax
   1166         addq    $96, %rsp
   1167         ret
   1168         .p2align 4,,10
   1169         .p2align 3
   1170 .L6:
   1171         mov     %eax, %edx
   1172         addq    -104(%rsp), %rdx
   1173         addl    $8, %eax
   1174         movl    %eax, -120(%rsp)
   1175         movl    (%rdx), %eax
   1176         addq    $96, %rsp
   1177         ret
   1178 
   1179 and it gets compiled into this on x86:
   1180 	pushl   %ebp
   1181         movl    %esp, %ebp
   1182         subl    $4, %esp
   1183         leal    12(%ebp), %eax
   1184         movl    %eax, -4(%ebp)
   1185         leal    16(%ebp), %eax
   1186         movl    %eax, -4(%ebp)
   1187         movl    12(%ebp), %eax
   1188         addl    $4, %esp
   1189         popl    %ebp
   1190         ret
   1191 
   1192 gcc 4.3 generates:
   1193 	pushl   %ebp
   1194         movl    %esp, %ebp
   1195         movl    12(%ebp), %eax
   1196         popl    %ebp
   1197         ret
   1198 
   1199 //===---------------------------------------------------------------------===//
   1200 
   1201 Teach tblgen not to check bitconvert source type in some cases. This allows us
   1202 to consolidate the following patterns in X86InstrMMX.td:
   1203 
   1204 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1205                                                   (iPTR 0))))),
   1206           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
   1207 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1208                                                   (iPTR 0))))),
   1209           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
   1210 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1211                                                   (iPTR 0))))),
   1212           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
   1213 
   1214 There are other cases in various td files.
   1215 
   1216 //===---------------------------------------------------------------------===//
   1217 
   1218 Take something like the following on x86-32:
   1219 unsigned a(unsigned long long x, unsigned y) {return x % y;}
   1220 
   1221 We currently generate a libcall, but we really shouldn't: the expansion is
   1222 shorter and likely faster than the libcall.  The expected code is something
   1223 like the following:
   1224 
   1225 	movl	12(%ebp), %eax
   1226 	movl	16(%ebp), %ecx
   1227 	xorl	%edx, %edx
   1228 	divl	%ecx
   1229 	movl	8(%ebp), %eax
   1230 	divl	%ecx
   1231 	movl	%edx, %eax
   1232 	ret
   1233 
   1234 A similar code sequence works for division.
   1235 
   1236 //===---------------------------------------------------------------------===//
   1237 
   1238 We currently compile this:
   1239 
   1240 define i32 @func1(i32 %v1, i32 %v2) nounwind {
   1241 entry:
   1242   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   1243   %sum = extractvalue {i32, i1} %t, 0
   1244   %obit = extractvalue {i32, i1} %t, 1
   1245   br i1 %obit, label %overflow, label %normal
   1246 normal:
   1247   ret i32 %sum
   1248 overflow:
   1249   call void @llvm.trap()
   1250   unreachable
   1251 }
   1252 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
   1253 declare void @llvm.trap()
   1254 
   1255 to:
   1256 
   1257 _func1:
   1258 	movl	4(%esp), %eax
   1259 	addl	8(%esp), %eax
   1260 	jo	LBB1_2	## overflow
   1261 LBB1_1:	## normal
   1262 	ret
   1263 LBB1_2:	## overflow
   1264 	ud2
   1265 
   1266 It would be nice to produce "into" someday.
   1267 
   1268 //===---------------------------------------------------------------------===//
   1269 
   1270 Test instructions can be eliminated by using EFLAGS values from arithmetic
   1271 instructions. This is currently not done for mul, and, or, xor, neg, shl,
   1272 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
   1273 for read-modify-write instructions. It is also currently not done if the
   1274 OF or CF flags are needed.
   1275 
   1276 The shift operators have the complication that when the shift count is
   1277 zero, EFLAGS is not set, so they can only subsume a test instruction if
   1278 the shift count is known to be non-zero. Also, using the EFLAGS value
   1279 from a shift is apparently very slow on some x86 implementations.
   1280 
   1281 In read-modify-write instructions, the root node in the isel match is
   1282 the store, and isel has no way for the use of the EFLAGS result of the
   1283 arithmetic to be remapped to the new node.
   1284 
   1285 Add and subtract instructions set OF on signed overflow and CF on unsigned
   1286 overflow, while test instructions always clear OF and CF. In order to
   1287 replace a test with an add or subtract in a situation where OF or CF is
   1288 needed, codegen must be able to prove that the operation cannot see
   1289 signed or unsigned overflow, respectively.
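
A small example of the pattern in question (hypothetical function): the andl
that computes t already sets ZF, so the separate test before the branch is
redundant as long as nothing clobbers the flags in between.

int and_then_branch(int a, int b) {
  int t = a & b;   /* andl sets ZF according to the result */
  if (t == 0)      /* the test emitted here could reuse the andl's flags */
    return -1;
  return t;
}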
   1290 
   1291 //===---------------------------------------------------------------------===//
   1292 
   1293 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
   1294 define <16 x float> @foo(<16 x float> %A) nounwind {
   1295 	%tmp = alloca <16 x float>, align 16
   1296 	%tmp2 = alloca <16 x float>, align 16
   1297 	store <16 x float> %A, <16 x float>* %tmp
   1298 	%s = bitcast <16 x float>* %tmp to i8*
   1299 	%s2 = bitcast <16 x float>* %tmp2 to i8*
   1300 	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
   1301 	%R = load <16 x float>* %tmp2
   1302 	ret <16 x float> %R
   1303 }
   1304 
   1305 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
   1306 
   1307 which compiles to:
   1308 
   1309 _foo:
   1310 	subl	$140, %esp
   1311 	movaps	%xmm3, 112(%esp)
   1312 	movaps	%xmm2, 96(%esp)
   1313 	movaps	%xmm1, 80(%esp)
   1314 	movaps	%xmm0, 64(%esp)
   1315 	movl	60(%esp), %eax
   1316 	movl	%eax, 124(%esp)
   1317 	movl	56(%esp), %eax
   1318 	movl	%eax, 120(%esp)
   1319 	movl	52(%esp), %eax
   1320         <many many more 32-bit copies>
   1321       	movaps	(%esp), %xmm0
   1322 	movaps	16(%esp), %xmm1
   1323 	movaps	32(%esp), %xmm2
   1324 	movaps	48(%esp), %xmm3
   1325 	addl	$140, %esp
   1326 	ret
   1327 
   1328 On Nehalem, it may even be cheaper to just use movups when unaligned than to
   1329 fall back to lower-granularity chunks.
   1330 
   1331 //===---------------------------------------------------------------------===//
   1332 
   1333 Implement processor-specific optimizations for parity with GCC on these
   1334 processors.  GCC does two optimizations:
   1335 
   1336 1. ix86_pad_returns inserts a noop before ret instructions if immediately
   1337    preceded by a conditional branch or is the target of a jump.
   1338 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   1339    code contains more than 3 branches.
   1340    
   1341 The first one is done for all AMDs, Core2, and "Generic".
   1342 The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
   1343   Core 2, and "Generic".
   1344 
   1345 //===---------------------------------------------------------------------===//
   1346 Testcase:
   1347 int x(int a) { return (a&0xf0)>>4; }
   1348 
   1349 Current output:
   1350 	movl	4(%esp), %eax
   1351 	shrl	$4, %eax
   1352 	andl	$15, %eax
   1353 	ret
   1354 
   1355 Ideal output:
   1356 	movzbl	4(%esp), %eax
   1357 	shrl	$4, %eax
   1358 	ret
   1359 
   1360 //===---------------------------------------------------------------------===//
   1361 
   1362 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
   1363 properly.
   1364 
   1365 When the return value is not used (i.e. we only care about the value in
   1366 memory), x86 does not have to produce the result in a register. Instead, it can
   1367 use add, sub, inc, and dec instructions with the "lock" prefix.
   1368 
   1369 This is currently implemented using a bit of an instruction selection trick. The
   1370 issue is that the target-independent pattern produces one output and a chain, and
   1371 we want to map it into one that just outputs a chain. The current trick is to
   1372 select it into a MERGE_VALUES with the first definition being an implicit_def.
   1373 The proper solution is to add new ISD opcodes for the no-output variant. The DAG
   1374 combiner can then transform the node before it gets to target node selection.
   1375 
   1376 Problem #2 is that we are adding a whole bunch of x86 atomic instructions when in
   1377 fact these instructions are identical to the non-lock versions. We need a way to
   1378 add target-specific information to target nodes and have this information
   1379 carried over to machine instructions. The asm printer (or JIT) can use this
   1380 information to add the "lock" prefix.
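
The "return value not used" case looks like this in source (hypothetical
helper); since nobody reads the old value, a single "lock addl $1, (%rdi)" (or
"lock incl") is all that is needed:

void atomic_bump(int *p) {
  __sync_add_and_fetch(p, 1);   /* result unused: a plain locked add suffices */
}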
   1381 
   1382 //===---------------------------------------------------------------------===//
   1383 
   1384 struct B {
   1385   unsigned char y0 : 1;
   1386 };
   1387 
   1388 int bar(struct B* a) { return a->y0; }
   1389 
   1390 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
   1391   %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
   1392   %2 = load i8* %1, align 1
   1393   %3 = and i8 %2, 1
   1394   %4 = zext i8 %3 to i32
   1395   ret i32 %4
   1396 }
   1397 
   1398 bar:                                    # @bar
   1399 # BB#0:
   1400         movb    (%rdi), %al
   1401         andb    $1, %al
   1402         movzbl  %al, %eax
   1403         ret
   1404 
   1405 Missed optimization: should be movl+andl.
   1406 
   1407 //===---------------------------------------------------------------------===//
   1408 
   1409 The x86_64 ABI says:
   1410 
   1411 Booleans, when stored in a memory object, are stored as single byte objects the
   1412 value of which is always 0 (false) or 1 (true).
   1413 
   1414 We are not using this fact:
   1415 
   1416 int bar(_Bool *a) { return *a; }
   1417 
   1418 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
   1419   %1 = load i8* %a, align 1, !tbaa !0
   1420   %tmp = and i8 %1, 1
   1421   %2 = zext i8 %tmp to i32
   1422   ret i32 %2
   1423 }
   1424 
   1425 bar:
   1426         movb    (%rdi), %al
   1427         andb    $1, %al
   1428         movzbl  %al, %eax
   1429         ret
   1430 
   1431 GCC produces
   1432 
   1433 bar:
   1434         movzbl  (%rdi), %eax
   1435         ret
   1436 
   1437 //===---------------------------------------------------------------------===//
   1438 
   1439 Consider the following two functions compiled with clang:
   1440 _Bool foo(int *x) { return !(*x & 4); }
   1441 unsigned bar(int *x) { return !(*x & 4); }
   1442 
   1443 foo:
   1444 	movl	4(%esp), %eax
   1445 	testb	$4, (%eax)
   1446 	sete	%al
   1447 	movzbl	%al, %eax
   1448 	ret
   1449 
   1450 bar:
   1451 	movl	4(%esp), %eax
   1452 	movl	(%eax), %eax
   1453 	shrl	$2, %eax
   1454 	andl	$1, %eax
   1455 	xorl	$1, %eax
   1456 	ret
   1457 
   1458 The second function generates more code even though the two functions are
   1459 functionally identical.
   1460 
   1461 //===---------------------------------------------------------------------===//
   1462 
   1463 Take the following C code:
   1464 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
   1465 
   1466 We generate the following IR with clang:
   1467 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1468 entry:
   1469   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
   1470   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
   1471   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
   1472   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1473   ret i32 %conv5
   1474 }
   1475 
   1476 And the following x86 code:
   1477 	xorl	%esi, %edi
   1478 	testb	$-1, %dil
   1479 	sete	%al
   1480 	movzbl	%al, %eax
   1481 	ret
   1482 
   1483 A cmpb instead of the xorl+testb would be one instruction shorter.
   1484 
   1485 //===---------------------------------------------------------------------===//
   1486 
   1487 Given the following C code:
   1488 int f(int a, int b) { return (signed char)a == (signed char)b; }
   1489 
   1490 We generate the following IR with clang:
   1491 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1492 entry:
   1493   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
   1494   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
   1495   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
   1496   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
   1497   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
   1498   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1499   ret i32 %conv5
   1500 }
   1501 
   1502 And the following x86 code:
   1503 	movsbl	%sil, %eax
   1504 	movsbl	%dil, %ecx
   1505 	cmpl	%eax, %ecx
   1506 	sete	%al
   1507 	movzbl	%al, %eax
   1508 	ret
   1509 
   1510 
   1511 It should be possible to eliminate the sign extensions.
   1512 
   1513 //===---------------------------------------------------------------------===//
   1514 
   1515 LLVM misses a load+store narrowing opportunity in this code:
   1516 
   1517 %struct.bf = type { i64, i16, i16, i32 }
   1518 
   1519 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
   1520 
   1521 define void @t1() nounwind ssp {
   1522 entry:
   1523   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1524   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
   1525   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
   1526   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
   1527   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
   1528   store i32 %4, i32* %2, align 1
   1529   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1530   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
   1531   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
   1532   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
   1533   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
   1534   store i32 %9, i32* %7, align 1
   1535   ret void
   1536 }
   1537 
   1538 LLVM currently emits this:
   1539 
   1540   movq  bfi(%rip), %rax
   1541   andl  $-65537, 8(%rax)
   1542   movq  bfi(%rip), %rax
   1543   andl  $-131073, 8(%rax)
   1544   ret
   1545 
   1546 It could narrow the loads and stores to emit this:
   1547 
   1548   movq  bfi(%rip), %rax
   1549   andb  $-2, 10(%rax)
   1550   movq  bfi(%rip), %rax
   1551   andb  $-3, 10(%rax)
   1552   ret
   1553 
   1554 The trouble is that there is a TokenFactor between the store and the
   1555 load, making it non-trivial to determine if there's anything between
   1556 the load and the store which would prohibit narrowing.
   1557 
   1558 //===---------------------------------------------------------------------===//
   1559 
   1560 This code:
   1561 void foo(unsigned x) {
   1562   if (x == 0) bar();
   1563   else if (x == 1) qux();
   1564 }
   1565 
   1566 currently compiles into:
   1567 _foo:
   1568 	movl	4(%esp), %eax
   1569 	cmpl	$1, %eax
   1570 	je	LBB0_3
   1571 	testl	%eax, %eax
   1572 	jne	LBB0_4
   1573 
   1574 the testl could be removed:
   1575 _foo:
   1576 	movl	4(%esp), %eax
   1577 	cmpl	$1, %eax
   1578 	je	LBB0_3
   1579 	jb	LBB0_4
   1580 
   1581 0 is the only unsigned number < 1.
   1582 
   1583 //===---------------------------------------------------------------------===//
   1584 
   1585 This code:
   1586 
   1587 %0 = type { i32, i1 }
   1588 
   1589 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
   1590 entry:
   1591   %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
   1592   %cmp = extractvalue %0 %uadd, 1
   1593   %inc = zext i1 %cmp to i32
   1594   %add = add i32 %x, %sum
   1595   %z.0 = add i32 %add, %inc
   1596   ret i32 %z.0
   1597 }
   1598 
   1599 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
   1600 
   1601 compiles to:
   1602 
   1603 _add32carry:                            ## @add32carry
   1604 	addl	%esi, %edi
   1605 	sbbl	%ecx, %ecx
   1606 	movl	%edi, %eax
   1607 	subl	%ecx, %eax
   1608 	ret
   1609 
   1610 But it could be:
   1611 
   1612 _add32carry:
   1613 	leal	(%rsi,%rdi), %eax
   1614 	cmpl	%esi, %eax
   1615 	adcl	$0, %eax
   1616 	ret
   1617 
   1618 //===---------------------------------------------------------------------===//
   1619 
   1620 The hot loop of 256.bzip2 contains code that looks a bit like this:
   1621 
   1622 int foo(char *P, char *Q, int x, int y) {
   1623   if (P[0] != Q[0])
   1624      return P[0] < Q[0];
   1625   if (P[1] != Q[1])
   1626      return P[1] < Q[1];
   1627   if (P[2] != Q[2])
   1628      return P[2] < Q[2];
   1629    return P[3] < Q[3];
   1630 }
   1631 
   1632 In the real code, we get a lot more wrong than this.  However, even in this
   1633 code we generate:
   1634 
   1635 _foo:                                   ## @foo
   1636 ## BB#0:                                ## %entry
   1637 	movb	(%rsi), %al
   1638 	movb	(%rdi), %cl
   1639 	cmpb	%al, %cl
   1640 	je	LBB0_2
   1641 LBB0_1:                                 ## %if.then
   1642 	cmpb	%al, %cl
   1643 	jmp	LBB0_5
   1644 LBB0_2:                                 ## %if.end
   1645 	movb	1(%rsi), %al
   1646 	movb	1(%rdi), %cl
   1647 	cmpb	%al, %cl
   1648 	jne	LBB0_1
   1649 ## BB#3:                                ## %if.end38
   1650 	movb	2(%rsi), %al
   1651 	movb	2(%rdi), %cl
   1652 	cmpb	%al, %cl
   1653 	jne	LBB0_1
   1654 ## BB#4:                                ## %if.end60
   1655 	movb	3(%rdi), %al
   1656 	cmpb	3(%rsi), %al
   1657 LBB0_5:                                 ## %if.end60
   1658 	setl	%al
   1659 	movzbl	%al, %eax
   1660 	ret
   1661 
   1662 Note that we generate jumps to LBB0_1 which does a redundant compare.  The
   1663 redundant compare also forces the register values to be live, which prevents
   1664 folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
   1665 
   1666 _foo:
   1667 	movzbl	(%rsi), %eax
   1668 	cmpb	%al, (%rdi)
   1669 	jne	L10
   1670 L12:
   1671 	movzbl	1(%rsi), %eax
   1672 	cmpb	%al, 1(%rdi)
   1673 	jne	L10
   1674 	movzbl	2(%rsi), %eax
   1675 	cmpb	%al, 2(%rdi)
   1676 	jne	L10
   1677 	movzbl	3(%rdi), %eax
   1678 	cmpb	3(%rsi), %al
   1679 L10:
   1680 	setl	%al
   1681 	movzbl	%al, %eax
   1682 	ret
   1683 
   1684 which is "perfect".
   1685 
   1686 //===---------------------------------------------------------------------===//
   1687 
   1688 For the branch in the following code:
   1689 int a();
   1690 int b(int x, int y) {
   1691   if (x & (1<<(y&7)))
   1692     return a();
   1693   return y;
   1694 }
   1695 
   1696 We currently generate:
   1697 	movb	%sil, %al
   1698 	andb	$7, %al
   1699 	movzbl	%al, %eax
   1700 	btl	%eax, %edi
   1701 	jae	.LBB0_2
   1702 
   1703 movl+andl would be shorter than the movb+andb+movzbl sequence.
   1704 
   1705 //===---------------------------------------------------------------------===//
   1706 
   1707 For the following:
   1708 struct u1 {
   1709     float x, y;
   1710 };
   1711 float foo(struct u1 u) {
   1712     return u.x + u.y;
   1713 }
   1714 
   1715 We currently generate:
   1716 	movdqa	%xmm0, %xmm1
   1717 	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
   1718 	addss	%xmm1, %xmm0
   1719 	ret
   1720 
   1721 We could save an instruction here by commuting the addss.
   1722 
   1723 //===---------------------------------------------------------------------===//
   1724 
   1725 This (from PR9661):
   1726 
   1727 float clamp_float(float a) {
   1728         if (a > 1.0f)
   1729                 return 1.0f;
   1730         else if (a < 0.0f)
   1731                 return 0.0f;
   1732         else
   1733                 return a;
   1734 }
   1735 
   1736 Could compile to:
   1737 
   1738 clamp_float:                            # @clamp_float
   1739         movss   .LCPI0_0(%rip), %xmm1
   1740         minss   %xmm1, %xmm0
   1741         pxor    %xmm1, %xmm1
   1742         maxss   %xmm1, %xmm0
   1743         ret
   1744 
   1745 with -ffast-math.
   1746 
   1747 //===---------------------------------------------------------------------===//
   1748 
   1749 This function (from PR9803):
   1750 
   1751 int clamp2(int a) {
   1752         if (a > 5)
   1753                 a = 5;
   1754         if (a < 0) 
   1755                 return 0;
   1756         return a;
   1757 }
   1758 
   1759 Compiles to:
   1760 
   1761 _clamp2:                                ## @clamp2
   1762         pushq   %rbp
   1763         movq    %rsp, %rbp
   1764         cmpl    $5, %edi
   1765         movl    $5, %ecx
   1766         cmovlel %edi, %ecx
   1767         testl   %ecx, %ecx
   1768         movl    $0, %eax
   1769         cmovnsl %ecx, %eax
   1770         popq    %rbp
   1771         ret
   1772 
   1773 The move of 0 could be scheduled above the test so that it becomes xor reg,reg.
   1774 
   1775 //===---------------------------------------------------------------------===//
   1776 
   1777 GCC PR48986.  We currently compile this:
   1778 
   1779 void bar(void);
   1780 void yyy(int* p) {
   1781     if (__sync_fetch_and_add(p, -1) == 1)
   1782       bar();
   1783 }
   1784 
   1785 into:
   1786 	movl	$-1, %eax
   1787 	lock
   1788 	xaddl	%eax, (%rdi)
   1789 	cmpl	$1, %eax
   1790 	je	LBB0_2
   1791 
   1792 Instead we could generate:
   1793 
   1794 	lock
   1795 	dec %rdi
   1796 	je LBB0_2
   1797 
   1798 The trick is to match "fetch_and_add(X, -C) == C".
   1799 
   1800 //===---------------------------------------------------------------------===//
   1801 
   1802 unsigned t(unsigned a, unsigned b) {
   1803   return a <= b ? 5 : -5;
   1804 }
   1805 
   1806 We generate:
   1807 	movl	$5, %ecx
   1808 	cmpl	%esi, %edi
   1809 	movl	$-5, %eax
   1810 	cmovbel	%ecx, %eax
   1811 
   1812 GCC:
   1813 	cmpl	%edi, %esi
   1814 	sbbl	%eax, %eax
   1815 	andl	$-10, %eax
   1816 	addl	$5, %eax
   1817 
   1818 //===---------------------------------------------------------------------===//
   1819