      1 //===---------------------------------------------------------------------===//
      2 // Random ideas for the X86 backend.
      3 //===---------------------------------------------------------------------===//
      4 
      5 This should be one DIV/IDIV instruction, not a libcall:
      6 
      7 unsigned test(unsigned long long X, unsigned Y) {
      8         return X/Y;
      9 }
     10 
     11 This can be done trivially with a custom legalizer.  What about overflow 
     12 though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
     13 
     14 //===---------------------------------------------------------------------===//
     15 
     16 Improvements to the multiply -> shift/add algorithm:
     17 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
     18 
     19 //===---------------------------------------------------------------------===//
     20 
     21 Improve code like this (occurs fairly frequently, e.g. in LLVM):
     22 long long foo(int x) { return 1LL << x; }
     23 
     24 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
     25 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
     26 http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
     27 
     28 Another useful one would be  ~0ULL >> X and ~0ULL << X.
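
For reference, two small functions (names are hypothetical) that exercise these
patterns:

unsigned long long mask_low(unsigned X)  { return ~0ULL >> X; }
unsigned long long mask_high(unsigned X) { return ~0ULL << X; }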
     29 
     30 One better solution for 1LL << x is:
     31         xorl    %eax, %eax
     32         xorl    %edx, %edx
     33         testb   $32, %cl
     34         sete    %al
     35         setne   %dl
     36         sall    %cl, %eax
     37         sall    %cl, %edx
     38 
     39 But that requires good 8-bit subreg support.
     40 
     41 Also, this might be better.  It's an extra shift, but it's one instruction
     42 shorter, and doesn't stress 8-bit subreg support.
     43 (From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
     44 but without the unnecessary and.)
     45         movl %ecx, %eax
     46         shrl $5, %eax
     47         movl %eax, %edx
     48         xorl $1, %edx
     49         sall %cl, %eax
	sall %cl, %edx
     51 
     52 64-bit shifts (in general) expand to really bad code.  Instead of using
     53 cmovs, we should expand to a conditional branch like GCC produces.
     54 
     55 //===---------------------------------------------------------------------===//
     56 
     57 Some isel ideas:
     58 
1. Dynamic programming based approach when compile time is not an
   issue.
     61 2. Code duplication (addressing mode) during isel.
     62 3. Other ideas from "Register-Sensitive Selection, Duplication, and
     63    Sequencing of Instructions".
     64 4. Scheduling for reduced register pressure.  E.g. "Minimum Register 
     65    Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 
     66    and other related papers.
     67    http://citeseer.ist.psu.edu/govindarajan01minimum.html
     68 
     69 //===---------------------------------------------------------------------===//
     70 
     71 Should we promote i16 to i32 to avoid partial register update stalls?
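
A minimal example (hypothetical) of the kind of 16-bit arithmetic in question:

short add16(short a, short b) { return a + b; }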
     72 
     73 //===---------------------------------------------------------------------===//
     74 
Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post-register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it, though.
     79 
     80 //===---------------------------------------------------------------------===//
     81 
It appears ICC uses push for parameter passing. This needs investigation.
     83 
     84 //===---------------------------------------------------------------------===//
     85 
     86 This:
     87 
     88 void foo(void);
     89 void bar(int x, int *P) { 
     90   x >>= 2;
     91   if (x) 
     92     foo();
     93   *P = x;
     94 }
     95 
     96 compiles into:
     97 
     98 	movq	%rsi, %rbx
     99 	movl	%edi, %r14d
    100 	sarl	$2, %r14d
    101 	testl	%r14d, %r14d
    102 	je	LBB0_2
    103 
    104 Instead of doing an explicit test, we can use the flags off the sar.  This
    105 occurs in a bigger testcase like this, which is pretty common:
    106 
    107 #include <vector>
    108 int test1(std::vector<int> &X) {
    109   int Sum = 0;
    110   for (long i = 0, e = X.size(); i != e; ++i)
    111     X[i] = 0;
    112   return Sum;
    113 }
    114 
    115 //===---------------------------------------------------------------------===//
    116 
    117 Only use inc/neg/not instructions on processors where they are faster than
    118 add/sub/xor.  They are slower on the P4 due to only updating some processor
    119 flags.
    120 
    121 //===---------------------------------------------------------------------===//
    122 
    123 The instruction selector sometimes misses folding a load into a compare.  The
    124 pattern is written as (cmp reg, (load p)).  Because the compare isn't 
    125 commutative, it is not matched with the load on both sides.  The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
    127 when it can invert the result of the compare for free.
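
A sketch (hypothetical function) where the load ends up on the LHS of the
compare, so the written pattern does not match:

int lt_load(int *p, int x) { return *p < x; }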
    128 
    129 //===---------------------------------------------------------------------===//
    130 
    131 In many cases, LLVM generates code like this:
    132 
    133 _test:
    134         movl 8(%esp), %eax
    135         cmpl %eax, 4(%esp)
    136         setl %al
    137         movzbl %al, %eax
    138         ret
    139 
On some processors (which ones?), it is more efficient to do this:
    141 
    142 _test:
    143         movl 8(%esp), %ebx
    144         xor  %eax, %eax
    145         cmpl %ebx, 4(%esp)
    146         setl %al
    147         ret
    148 
    149 Doing this correctly is tricky though, as the xor clobbers the flags.
    150 
    151 //===---------------------------------------------------------------------===//
    152 
    153 We should generate bts/btr/etc instructions on targets where they are cheap or
    154 when codesize is important.  e.g., for:
    155 
    156 void setbit(int *target, int bit) {
    157     *target |= (1 << bit);
    158 }
    159 void clearbit(int *target, int bit) {
    160     *target &= ~(1 << bit);
    161 }
    162 
    163 //===---------------------------------------------------------------------===//
    164 
    165 Instead of the following for memset char*, 1, 10:
    166 
    167 	movl $16843009, 4(%edx)
    168 	movl $16843009, (%edx)
    169 	movw $257, 8(%edx)
    170 
    171 It might be better to generate
    172 
    173 	movl $16843009, %eax
    174 	movl %eax, 4(%edx)
    175 	movl %eax, (%edx)
	movw %ax, 8(%edx)
    177 	
    178 when we can spare a register. It reduces code size.
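
For reference, a C reproducer of this memset case (assuming the usual libc
memset):

#include <string.h>

void set10(char *p) { memset(p, 1, 10); }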
    179 
    180 //===---------------------------------------------------------------------===//
    181 
    182 Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
    183 get this:
    184 
    185 define i32 @test1(i32 %X) {
    186     %Y = sdiv i32 %X, 8
    187     ret i32 %Y
    188 }
    189 
    190 _test1:
    191         movl 4(%esp), %eax
    192         movl %eax, %ecx
    193         sarl $31, %ecx
    194         shrl $29, %ecx
    195         addl %ecx, %eax
    196         sarl $3, %eax
    197         ret
    198 
    199 GCC knows several different ways to codegen it, one of which is this:
    200 
    201 _test1:
    202         movl    4(%esp), %eax
    203         cmpl    $-1, %eax
    204         leal    7(%eax), %ecx
    205         cmovle  %ecx, %eax
    206         sarl    $3, %eax
    207         ret
    208 
    209 which is probably slower, but it's interesting at least :)
    210 
    211 //===---------------------------------------------------------------------===//
    212 
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.).
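
A sketch of the kind of call in question (the 1MB size is arbitrary):

#include <string.h>

void copy_big(char *dst, const char *src) { memcpy(dst, src, 1024 * 1024); }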
    217 
    218 //===---------------------------------------------------------------------===//
    219 
    220 Optimize this into something reasonable:
    221  x * copysign(1.0, y) * copysign(1.0, z)
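
For example (a sketch using copysign from <math.h>):

#include <math.h>

double signprod(double x, double y, double z) {
  return x * copysign(1.0, y) * copysign(1.0, z);
}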
    222 
    223 //===---------------------------------------------------------------------===//
    224 
    225 Optimize copysign(x, *y) to use an integer load from y.
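
E.g. (hypothetical wrapper):

#include <math.h>

double copysign_mem(double x, double *y) { return copysign(x, *y); }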
    226 
    227 //===---------------------------------------------------------------------===//
    228 
    229 The following tests perform worse with LSR:
    230 
    231 lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
    232 
    233 //===---------------------------------------------------------------------===//
    234 
    235 Adding to the list of cmp / test poor codegen issues:
    236 
    237 int test(__m128 *A, __m128 *B) {
    238   if (_mm_comige_ss(*A, *B))
    239     return 3;
    240   else
    241     return 4;
    242 }
    243 
    244 _test:
    245 	movl 8(%esp), %eax
    246 	movaps (%eax), %xmm0
    247 	movl 4(%esp), %eax
    248 	movaps (%eax), %xmm1
    249 	comiss %xmm0, %xmm1
    250 	setae %al
    251 	movzbl %al, %ecx
    252 	movl $3, %eax
    253 	movl $4, %edx
    254 	cmpl $0, %ecx
    255 	cmove %edx, %eax
    256 	ret
    257 
Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and the select. 2) The intrinsic is expected to produce an i32 value
so an any_extend (which becomes a zero_extend) is added.
    262 
    263 We probably need some kind of target DAG combine hook to fix this.
    264 
    265 //===---------------------------------------------------------------------===//
    266 
    267 We generate significantly worse code for this than GCC:
    268 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
    269 http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
    270 
    271 There is also one case we do worse on PPC.
    272 
    273 //===---------------------------------------------------------------------===//
    274 
    275 For this:
    276 
    277 int test(int a)
    278 {
    279   return a * 3;
    280 }
    281 
We currently emit:
	imull $3, 4(%esp), %eax

Perhaps this is what we really should generate. Is imull three or four
cycles? Note: ICC generates this:
    287 	movl	4(%esp), %eax
    288 	leal	(%eax,%eax,2), %eax
    289 
    290 The current instruction priority is based on pattern complexity. The former is
    291 more "complex" because it folds a load so the latter will not be emitted.
    292 
    293 Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
    294 should always try to match LEA first since the LEA matching code does some
    295 estimate to determine whether the match is profitable.
    296 
    297 However, if we care more about code size, then imull is better. It's two bytes
    298 shorter than movl + leal.
    299 
    300 On a Pentium M, both variants have the same characteristics with regard
    301 to throughput; however, the multiplication has a latency of four cycles, as
    302 opposed to two cycles for the movl+lea variant.
    303 
    304 //===---------------------------------------------------------------------===//
    305 
    306 __builtin_ffs codegen is messy.
    307 
    308 int ffs_(unsigned X) { return __builtin_ffs(X); }
    309 
    310 llvm produces:
    311 ffs_:
    312         movl    4(%esp), %ecx
    313         bsfl    %ecx, %eax
    314         movl    $32, %edx
    315         cmove   %edx, %eax
    316         incl    %eax
    317         xorl    %edx, %edx
    318         testl   %ecx, %ecx
    319         cmove   %edx, %eax
    320         ret
    321 
    322 vs gcc:
    323 
    324 _ffs_:
    325         movl    $-1, %edx
    326         bsfl    4(%esp), %eax
    327         cmove   %edx, %eax
    328         addl    $1, %eax
    329         ret
    330 
    331 Another example of __builtin_ffs (use predsimplify to eliminate a select):
    332 
    333 int foo (unsigned long j) {
    334   if (j)
    335     return __builtin_ffs (j) - 1;
    336   else
    337     return 0;
    338 }
    339 
    340 //===---------------------------------------------------------------------===//
    341 
It appears GCC places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.
    347 
    348 //===---------------------------------------------------------------------===//
    349 
    350 define i32 @foo(i32* %a, i32 %t) {
    351 entry:
    352 	br label %cond_true
    353 
    354 cond_true:		; preds = %cond_true, %entry
    355 	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
    356 	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
    357 	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
    358 	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
    359 	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
    360 	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
    361 	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
    362 	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
    363 	br i1 %tmp, label %bb12, label %cond_true
    364 
    365 bb12:		; preds = %cond_true
    366 	ret i32 %tmp7
    367 }
This is pessimized by -loop-reduce and -indvars.
    369 
    370 //===---------------------------------------------------------------------===//
    371 
    372 u32 to float conversion improvement:
    373 
    374 float uint32_2_float( unsigned u ) {
    375   float fl = (int) (u & 0xffff);
    376   float fh = (int) (u >> 16);
    377   fh *= 0x1.0p16f;
    378   return fh + fl;
    379 }
    380 
    381 00000000        subl    $0x04,%esp
    382 00000003        movl    0x08(%esp,1),%eax
    383 00000007        movl    %eax,%ecx
    384 00000009        shrl    $0x10,%ecx
    385 0000000c        cvtsi2ss        %ecx,%xmm0
    386 00000010        andl    $0x0000ffff,%eax
    387 00000015        cvtsi2ss        %eax,%xmm1
    388 00000019        mulss   0x00000078,%xmm0
    389 00000021        addss   %xmm1,%xmm0
    390 00000025        movss   %xmm0,(%esp,1)
    391 0000002a        flds    (%esp,1)
    392 0000002d        addl    $0x04,%esp
    393 00000030        ret
    394 
    395 //===---------------------------------------------------------------------===//
    396 
When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.
    399 
    400 //===---------------------------------------------------------------------===//
    401 
    402 GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
    403 simplifications for integer "x cmp y ? a : b".
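
A representative input (sketch):

int sel(int x, int y, int a, int b) { return x < y ? a : b; }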
    404 
    405 //===---------------------------------------------------------------------===//
    406 
    407 Consider the expansion of:
    408 
    409 define i32 @test3(i32 %X) {
    410         %tmp1 = urem i32 %X, 255
    411         ret i32 %tmp1
    412 }
    413 
    414 Currently it compiles to:
    415 
    416 ...
    417         movl $2155905153, %ecx
    418         movl 8(%esp), %esi
    419         movl %esi, %eax
    420         mull %ecx
    421 ...
    422 
    423 This could be "reassociated" into:
    424 
    425         movl $2155905153, %eax
    426         movl 8(%esp), %ecx
    427         mull %ecx
    428 
    429 to avoid the copy.  In fact, the existing two-address stuff would do this
    430 except that mul isn't a commutative 2-addr instruction.  I guess this has
    431 to be done at isel time based on the #uses to mul?
    432 
    433 //===---------------------------------------------------------------------===//
    434 
Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example 256.bzip2:
    438 
    439 In the new trace, the hot loop has an instruction which crosses a cacheline
    440 boundary.  In addition to potential cache misses, this can't help decoding as I
    441 imagine there has to be some kind of complicated decoder reset and realignment
    442 to grab the bytes from the next cacheline.
    443 
532  532 0x3cfc movb     1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
942  942 0x3d03 movl     %dh, 1809(%esp, %esi)
    446 937  937 0x3d0a incl     %esi
    447 3    3   0x3d0b cmpb     %bl, %dl
    448 27   27  0x3d0d jnz      0x000062db <main+11707>
    449 
    450 //===---------------------------------------------------------------------===//
    451 
In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
    453 
    454 //===---------------------------------------------------------------------===//
    455 
    456 This could be a single 16-bit load.
    457 
    458 int f(char *p) {
    459     if ((p[0] == 1) & (p[1] == 2)) return 1;
    460     return 0;
    461 }
    462 
    463 //===---------------------------------------------------------------------===//
    464 
    465 We should inline lrintf and probably other libc functions.
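
E.g. (assuming the C99 lrintf from <math.h>):

#include <math.h>

long round_to_long(float x) { return lrintf(x); }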
    466 
    467 //===---------------------------------------------------------------------===//
    468 
    469 Use the FLAGS values from arithmetic instructions more.  For example, compile:
    470 
    471 int add_zf(int *x, int y, int a, int b) {
    472      if ((*x += y) == 0)
    473           return a;
    474      else
    475           return b;
    476 }
    477 
    478 to:
    479        addl    %esi, (%rdi)
    480        movl    %edx, %eax
    481        cmovne  %ecx, %eax
    482        ret
    483 instead of:
    484 
    485 _add_zf:
    486         addl (%rdi), %esi
    487         movl %esi, (%rdi)
    488         testl %esi, %esi
    489         cmove %edx, %ecx
    490         movl %ecx, %eax
    491         ret
    492 
    493 As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
    494 without a test instruction.
    495 
    496 //===---------------------------------------------------------------------===//
    497 
    498 These two functions have identical effects:
    499 
    500 unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
    501 unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
    502 
    503 We currently compile them to:
    504 
    505 _f:
    506         movl 4(%esp), %eax
    507         movl %eax, %ecx
    508         incl %ecx
    509         movl 8(%esp), %edx
    510         cmpl %edx, %ecx
    511         jne LBB1_2      #UnifiedReturnBlock
    512 LBB1_1: #cond_true
    513         addl $2, %eax
    514         ret
    515 LBB1_2: #UnifiedReturnBlock
    516         movl %ecx, %eax
    517         ret
    518 _f2:
    519         movl 4(%esp), %eax
    520         movl %eax, %ecx
    521         incl %ecx
    522         cmpl 8(%esp), %ecx
    523         sete %cl
    524         movzbl %cl, %ecx
    525         leal 1(%ecx,%eax), %eax
    526         ret
    527 
    528 both of which are inferior to GCC's:
    529 
    530 _f:
    531         movl    4(%esp), %edx
    532         leal    1(%edx), %eax
    533         addl    $2, %edx
    534         cmpl    8(%esp), %eax
    535         cmove   %edx, %eax
    536         ret
    537 _f2:
    538         movl    4(%esp), %eax
    539         addl    $1, %eax
    540         xorl    %edx, %edx
    541         cmpl    8(%esp), %eax
    542         sete    %dl
    543         addl    %edx, %eax
    544         ret
    545 
    546 //===---------------------------------------------------------------------===//
    547 
    548 This code:
    549 
    550 void test(int X) {
    551   if (X) abort();
    552 }
    553 
    554 is currently compiled to:
    555 
    556 _test:
    557         subl $12, %esp
    558         cmpl $0, 16(%esp)
    559         jne LBB1_1
    560         addl $12, %esp
    561         ret
    562 LBB1_1:
    563         call L_abort$stub
    564 
    565 It would be better to produce:
    566 
    567 _test:
    568         subl $12, %esp
    569         cmpl $0, 16(%esp)
    570         jne L_abort$stub
    571         addl $12, %esp
    572         ret
    573 
    574 This can be applied to any no-return function call that takes no arguments etc.
    575 Alternatively, the stack save/restore logic could be shrink-wrapped, producing
    576 something like this:
    577 
    578 _test:
    579         cmpl $0, 4(%esp)
    580         jne LBB1_1
    581         ret
    582 LBB1_1:
    583         subl $12, %esp
    584         call L_abort$stub
    585 
    586 Both are useful in different situations.  Finally, it could be shrink-wrapped
    587 and tail called, like this:
    588 
    589 _test:
    590         cmpl $0, 4(%esp)
    591         jne LBB1_1
    592         ret
    593 LBB1_1:
    594         pop %eax   # realign stack.
    595         call L_abort$stub
    596 
    597 Though this probably isn't worth it.
    598 
    599 //===---------------------------------------------------------------------===//
    600 
    601 Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
    602 a neg instead of a sub instruction.  Consider:
    603 
    604 int test(char X) { return 7-X; }
    605 
    606 we currently produce:
    607 _test:
    608         movl $7, %eax
    609         movsbl 4(%esp), %ecx
    610         subl %ecx, %eax
    611         ret
    612 
    613 We would use one fewer register if codegen'd as:
    614 
    615         movsbl 4(%esp), %eax
    616 	neg %eax
    617         add $7, %eax
    618         ret
    619 
    620 Note that this isn't beneficial if the load can be folded into the sub.  In
    621 this case, we want a sub:
    622 
    623 int test(int X) { return 7-X; }
    624 _test:
    625         movl $7, %eax
    626         subl 4(%esp), %eax
    627         ret
    628 
    629 //===---------------------------------------------------------------------===//
    630 
    631 Leaf functions that require one 4-byte spill slot have a prolog like this:
    632 
    633 _foo:
    634         pushl   %esi
    635         subl    $4, %esp
    636 ...
    637 and an epilog like this:
    638         addl    $4, %esp
    639         popl    %esi
    640         ret
    641 
    642 It would be smaller, and potentially faster, to push eax on entry and to
    643 pop into a dummy register instead of using addl/subl of esp.  Just don't pop 
    644 into any return registers :)
    645 
    646 //===---------------------------------------------------------------------===//
    647 
    648 The X86 backend should fold (branch (or (setcc, setcc))) into multiple 
    649 branches.  We generate really poor code for:
    650 
    651 double testf(double a) {
    652        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
    653 }
    654 
    655 For example, the entry BB is:
    656 
    657 _testf:
    658         subl    $20, %esp
    659         pxor    %xmm0, %xmm0
    660         movsd   24(%esp), %xmm1
    661         ucomisd %xmm0, %xmm1
    662         setnp   %al
    663         sete    %cl
    664         testb   %cl, %al
    665         jne     LBB1_5  # UnifiedReturnBlock
    666 LBB1_1: # cond_true
    667 
    668 
It would be better to replace the last four instructions with:
    670 
    671 	jp LBB1_1
    672 	je LBB1_5
    673 LBB1_1:
    674 
    675 We also codegen the inner ?: into a diamond:
    676 
    677        cvtss2sd        LCPI1_0(%rip), %xmm2
    678         cvtss2sd        LCPI1_1(%rip), %xmm3
    679         ucomisd %xmm1, %xmm0
    680         ja      LBB1_3  # cond_true
    681 LBB1_2: # cond_true
    682         movapd  %xmm3, %xmm2
    683 LBB1_3: # cond_true
    684         movapd  %xmm2, %xmm0
    685         ret
    686 
    687 We should sink the load into xmm3 into the LBB1_2 block.  This should
    688 be pretty easy, and will nuke all the copies.
    689 
    690 //===---------------------------------------------------------------------===//
    691 
    692 This:
        #include <utility>
    694         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
    695         { return std::make_pair(a + b, a + b < a); }
    696         bool no_overflow(unsigned a, unsigned b)
    697         { return !full_add(a, b).second; }
    698 
    699 Should compile to:
    700 	addl	%esi, %edi
    701 	setae	%al
    702 	movzbl	%al, %eax
    703 	ret
    704 
    705 on x86-64, instead of the rather stupid-looking:
    706 	addl	%esi, %edi
    707 	setb	%al
    708 	xorb	$1, %al
    709 	movzbl	%al, %eax
    710 	ret
    711 
    712 
    713 //===---------------------------------------------------------------------===//
    714 
    715 The following code:
    716 
    717 bb114.preheader:		; preds = %cond_next94
    718 	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
    719 	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
    720 	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
    721 	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
    722 	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
    723 	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
    724 	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
    725 	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
    726 	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
    727 	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
    728 	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
    729 	br label %bb114
    730 
    731 produces:
    732 
    733 LBB3_5:	# bb114.preheader
    734 	movswl	-68(%ebp), %eax
    735 	movl	$32, %ecx
    736 	movl	%ecx, -80(%ebp)
    737 	subl	%eax, -80(%ebp)
    738 	movswl	-52(%ebp), %eax
    739 	movl	%ecx, -84(%ebp)
    740 	subl	%eax, -84(%ebp)
    741 	movswl	-70(%ebp), %eax
    742 	movl	%ecx, -88(%ebp)
    743 	subl	%eax, -88(%ebp)
    744 	movswl	-50(%ebp), %eax
    745 	subl	%eax, %ecx
    746 	movl	%ecx, -76(%ebp)
    747 	movswl	-42(%ebp), %eax
    748 	movl	%eax, -92(%ebp)
    749 	movswl	-66(%ebp), %eax
    750 	movl	%eax, -96(%ebp)
    751 	movw	$0, -98(%ebp)
    752 
    753 This appears to be bad because the RA is not folding the store to the stack 
    754 slot into the movl.  The above instructions could be:
    755 	movl    $32, -80(%ebp)
    756 ...
    757 	movl    $32, -84(%ebp)
    758 ...
    759 This seems like a cross between remat and spill folding.
    760 
    761 This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
    762 change, so we could simply subtract %eax from %ecx first and then use %ecx (or
    763 vice-versa).
    764 
    765 //===---------------------------------------------------------------------===//
    766 
    767 This code:
    768 
    769 	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
    770 	br i1 %tmp659, label %cond_true662, label %cond_next715
    771 
    772 produces this:
    773 
    774 	testw	%cx, %cx
    775 	movswl	%cx, %esi
    776 	jns	LBB4_109	# cond_next715
    777 
    778 Shark tells us that using %cx in the testw instruction is sub-optimal. It
    779 suggests using the 32-bit register (which is what ICC uses).
    780 
    781 //===---------------------------------------------------------------------===//
    782 
    783 We compile this:
    784 
    785 void compare (long long foo) {
    786   if (foo < 4294967297LL)
    787     abort();
    788 }
    789 
    790 to:
    791 
    792 compare:
    793         subl    $4, %esp
    794         cmpl    $0, 8(%esp)
    795         setne   %al
    796         movzbw  %al, %ax
    797         cmpl    $1, 12(%esp)
    798         setg    %cl
    799         movzbw  %cl, %cx
    800         cmove   %ax, %cx
    801         testb   $1, %cl
    802         jne     .LBB1_2 # UnifiedReturnBlock
    803 .LBB1_1:        # ifthen
    804         call    abort
    805 .LBB1_2:        # UnifiedReturnBlock
    806         addl    $4, %esp
    807         ret
    808 
    809 (also really horrible code on ppc).  This is due to the expand code for 64-bit
    810 compares.  GCC produces multiple branches, which is much nicer:
    811 
    812 compare:
    813         subl    $12, %esp
    814         movl    20(%esp), %edx
    815         movl    16(%esp), %eax
    816         decl    %edx
    817         jle     .L7
    818 .L5:
    819         addl    $12, %esp
    820         ret
    821         .p2align 4,,7
    822 .L7:
    823         jl      .L4
    824         cmpl    $0, %eax
    825         .p2align 4,,8
    826         ja      .L5
    827 .L4:
    828         .p2align 4,,9
    829         call    abort
    830 
    831 //===---------------------------------------------------------------------===//
    832 
Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail-call-optimized calls) if they source from the caller's arguments
or from a virtual register (which may itself source from the caller's
arguments).
This is done to prevent overwriting parameters (see the example
below) that might be used later.
    840 
    841 example:  
    842 
    843 int callee(int32, int64); 
    844 int caller(int32 arg1, int32 arg2) { 
    845   int64 local = arg2 * 2; 
    846   return callee(arg2, (int64)local); 
    847 }
    848 
    849 [arg1]          [!arg2 no longer valid since we moved local onto it]
    850 [arg2]      ->  [(int64)
    851 [RETADDR]        local  ]
    852 
    853 Moving arg1 onto the stack slot of callee function would overwrite
    854 arg2 of the caller.
    855 
    856 Possible optimizations:
    857 
    858 
 - Analyse the actual parameters of the callee to see which ones would
   overwrite a caller parameter that is used by the callee, and push only
   those onto the top of the stack.
    862 
    863    int callee (int32 arg1, int32 arg2);
    864    int caller (int32 arg1, int32 arg2) {
    865        return callee(arg1,arg2);
    866    }
    867 
    868    Here we don't need to write any variables to the top of the stack
    869    since they don't overwrite each other.
    870 
    871    int callee (int32 arg1, int32 arg2);
    872    int caller (int32 arg1, int32 arg2) {
    873        return callee(arg2,arg1);
    874    }
    875 
    876    Here we need to push the arguments because they overwrite each
    877    other.
    878 
    879 //===---------------------------------------------------------------------===//
    880 
    881 main ()
    882 {
    883   int i = 0;
    884   unsigned long int z = 0;
    885 
    886   do {
    887     z -= 0x00004000;
    888     i++;
    889     if (i > 0x00040000)
    890       abort ();
    891   } while (z > 0);
    892   exit (0);
    893 }
    894 
    895 gcc compiles this to:
    896 
    897 _main:
    898 	subl	$28, %esp
    899 	xorl	%eax, %eax
    900 	jmp	L2
    901 L3:
    902 	cmpl	$262144, %eax
    903 	je	L10
    904 L2:
    905 	addl	$1, %eax
    906 	cmpl	$262145, %eax
    907 	jne	L3
    908 	call	L_abort$stub
    909 L10:
    910 	movl	$0, (%esp)
    911 	call	L_exit$stub
    912 
    913 llvm:
    914 
    915 _main:
    916 	subl	$12, %esp
    917 	movl	$1, %eax
    918 	movl	$16384, %ecx
    919 LBB1_1:	# bb
    920 	cmpl	$262145, %eax
    921 	jge	LBB1_4	# cond_true
    922 LBB1_2:	# cond_next
    923 	incl	%eax
    924 	addl	$4294950912, %ecx
    925 	cmpl	$16384, %ecx
    926 	jne	LBB1_1	# bb
    927 LBB1_3:	# bb11
    928 	xorl	%eax, %eax
    929 	addl	$12, %esp
    930 	ret
    931 LBB1_4:	# cond_true
    932 	call	L_abort$stub
    933 
    934 1. LSR should rewrite the first cmp with induction variable %ecx.
    935 2. DAG combiner should fold
    936         leal    1(%eax), %edx
    937         cmpl    $262145, %edx
    938    =>
    939         cmpl    $262144, %eax
    940 
    941 //===---------------------------------------------------------------------===//
    942 
    943 define i64 @test(double %X) {
    944 	%Y = fptosi double %X to i64
    945 	ret i64 %Y
    946 }
    947 
    948 compiles to:
    949 
    950 _test:
    951 	subl	$20, %esp
    952 	movsd	24(%esp), %xmm0
    953 	movsd	%xmm0, 8(%esp)
    954 	fldl	8(%esp)
    955 	fisttpll	(%esp)
    956 	movl	4(%esp), %edx
    957 	movl	(%esp), %eax
    958 	addl	$20, %esp
    959 	#FP_REG_KILL
    960 	ret
    961 
    962 This should just fldl directly from the input stack slot.
    963 
    964 //===---------------------------------------------------------------------===//
    965 
    966 This code:
    967 int foo (int x) { return (x & 65535) | 255; }
    968 
    969 Should compile into:
    970 
    971 _foo:
    972         movzwl  4(%esp), %eax
    973         orl     $255, %eax
    974         ret
    975 
    976 instead of:
    977 _foo:
    978 	movl	$65280, %eax
    979 	andl	4(%esp), %eax
    980 	orl	$255, %eax
    981 	ret
    982 
    983 //===---------------------------------------------------------------------===//
    984 
    985 We're codegen'ing multiply of long longs inefficiently:
    986 
    987 unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
    988   return arg1 *  arg2;
    989 }
    990 
    991 We compile to (fomit-frame-pointer):
    992 
    993 _LLM:
    994 	pushl	%esi
    995 	movl	8(%esp), %ecx
    996 	movl	16(%esp), %esi
    997 	movl	%esi, %eax
    998 	mull	%ecx
    999 	imull	12(%esp), %esi
   1000 	addl	%edx, %esi
   1001 	imull	20(%esp), %ecx
   1002 	movl	%esi, %edx
   1003 	addl	%ecx, %edx
   1004 	popl	%esi
   1005 	ret
   1006 
   1007 This looks like a scheduling deficiency and lack of remat of the load from
   1008 the argument area.  ICC apparently produces:
   1009 
   1010         movl      8(%esp), %ecx
   1011         imull     12(%esp), %ecx
   1012         movl      16(%esp), %eax
   1013         imull     4(%esp), %eax 
   1014         addl      %eax, %ecx  
   1015         movl      4(%esp), %eax
   1016         mull      12(%esp) 
   1017         addl      %ecx, %edx
   1018         ret
   1019 
   1020 Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
   1021 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
   1022 
   1023 //===---------------------------------------------------------------------===//
   1024 
   1025 We can fold a store into "zeroing a reg".  Instead of:
   1026 
   1027 xorl    %eax, %eax
   1028 movl    %eax, 124(%esp)
   1029 
   1030 we should get:
   1031 
   1032 movl    $0, 124(%esp)
   1033 
   1034 if the flags of the xor are dead.
   1035 
   1036 Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
   1037 be folded into: shl [mem], 1
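
A minimal source-level example (hypothetical) of the x<<1 case:

int dbl(int x) { return x << 1; }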
   1038 
   1039 //===---------------------------------------------------------------------===//
   1040 
   1041 In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
   1042 or and instruction, for example:
   1043 
   1044 	xorpd	LCPI1_0, %xmm2
   1045 
   1046 However, if xmm2 gets spilled, we end up with really ugly code like this:
   1047 
   1048 	movsd	(%esp), %xmm0
   1049 	xorpd	LCPI1_0, %xmm0
   1050 	movsd	%xmm0, (%esp)
   1051 
   1052 Since we 'know' that this is a 'neg', we can actually "fold" the spill into
   1053 the neg/abs instruction, turning it into an *integer* operation, like this:
   1054 
   1055 	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
   1056 
   1057 you could also use xorb, but xorl is less likely to lead to a partial register
   1058 stall.  Here is a contrived testcase:
   1059 
   1060 double a, b, c;
   1061 void test(double *P) {
   1062   double X = *P;
   1063   a = X;
   1064   bar();
   1065   X = -X;
   1066   b = X;
   1067   bar();
   1068   c = X;
   1069 }
   1070 
   1071 //===---------------------------------------------------------------------===//
   1072 
The code generated on x86 for checking for signed overflow on a multiply done
the obvious way is much longer than it needs to be.
   1075 
   1076 int x(int a, int b) {
   1077   long long prod = (long long)a*b;
   1078   return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
   1079 }
   1080 
   1081 See PR2053 for more details.
   1082 
   1083 //===---------------------------------------------------------------------===//
   1084 
We should investigate using cdq/cltd (effect: edx = sar eax, 31)
   1086 more aggressively; it should cost the same as a move+shift on any modern
   1087 processor, but it's a lot shorter. Downside is that it puts more
   1088 pressure on register allocation because it has fixed operands.
   1089 
   1090 Example:
   1091 int abs(int x) {return x < 0 ? -x : x;}
   1092 
   1093 gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
   1094 abs:
   1095         movl    4(%esp), %eax
   1096         cltd
   1097         xorl    %edx, %eax
   1098         subl    %edx, %eax
   1099         ret
   1100 
   1101 //===---------------------------------------------------------------------===//
   1102 
   1103 Take the following code (from 
   1104 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
   1105 
   1106 extern unsigned char first_one[65536];
   1107 int FirstOnet(unsigned long long arg1)
   1108 {
   1109   if (arg1 >> 48)
   1110     return (first_one[arg1 >> 48]);
   1111   return 0;
   1112 }
   1113 
   1114 
   1115 The following code is currently generated:
   1116 FirstOnet:
   1117         movl    8(%esp), %eax
   1118         cmpl    $65536, %eax
   1119         movl    4(%esp), %ecx
   1120         jb      .LBB1_2 # UnifiedReturnBlock
   1121 .LBB1_1:        # ifthen
   1122         shrl    $16, %eax
   1123         movzbl  first_one(%eax), %eax
   1124         ret
   1125 .LBB1_2:        # UnifiedReturnBlock
   1126         xorl    %eax, %eax
   1127         ret
   1128 
   1129 We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
   1130 lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
   1131 
   1132 //===---------------------------------------------------------------------===//
   1133 
   1134 We compile this function:
   1135 
   1136 define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
   1137 entry:
   1138 	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
   1139 	br i1 %tmp2, label %bb7, label %bb
   1140 
   1141 bb:		; preds = %entry
   1142 	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
   1143 	ret i32 %tmp6
   1144 
   1145 bb7:		; preds = %entry
   1146 	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
   1147 	ret i32 %tmp10
   1148 }
   1149 
   1150 to:
   1151 
   1152 foo:                                    # @foo
   1153 # BB#0:                                 # %entry
   1154 	movl	4(%esp), %ecx
   1155 	cmpb	$0, 16(%esp)
   1156 	je	.LBB0_2
   1157 # BB#1:                                 # %bb
   1158 	movl	8(%esp), %eax
   1159 	addl	%ecx, %eax
   1160 	ret
   1161 .LBB0_2:                                # %bb7
   1162 	movl	12(%esp), %edx
   1163 	movl	%ecx, %eax
   1164 	subl	%edx, %eax
   1165 	ret
   1166 
   1167 There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
   1168 couple more movls by putting 4(%esp) into %eax instead of %ecx.
   1169 
   1170 //===---------------------------------------------------------------------===//
   1171 
   1172 See rdar://4653682.
   1173 
   1174 From flops:
   1175 
   1176 LBB1_15:        # bb310
   1177         cvtss2sd        LCPI1_0, %xmm1
   1178         addsd   %xmm1, %xmm0
   1179         movsd   176(%esp), %xmm2
   1180         mulsd   %xmm0, %xmm2
   1181         movapd  %xmm2, %xmm3
   1182         mulsd   %xmm3, %xmm3
   1183         movapd  %xmm3, %xmm4
   1184         mulsd   LCPI1_23, %xmm4
   1185         addsd   LCPI1_24, %xmm4
   1186         mulsd   %xmm3, %xmm4
   1187         addsd   LCPI1_25, %xmm4
   1188         mulsd   %xmm3, %xmm4
   1189         addsd   LCPI1_26, %xmm4
   1190         mulsd   %xmm3, %xmm4
   1191         addsd   LCPI1_27, %xmm4
   1192         mulsd   %xmm3, %xmm4
   1193         addsd   LCPI1_28, %xmm4
   1194         mulsd   %xmm3, %xmm4
   1195         addsd   %xmm1, %xmm4
   1196         mulsd   %xmm2, %xmm4
   1197         movsd   152(%esp), %xmm1
   1198         addsd   %xmm4, %xmm1
   1199         movsd   %xmm1, 152(%esp)
   1200         incl    %eax
   1201         cmpl    %eax, %esi
   1202         jge     LBB1_15 # bb310
   1203 LBB1_16:        # bb358.loopexit
   1204         movsd   152(%esp), %xmm0
   1205         addsd   %xmm0, %xmm0
   1206         addsd   LCPI1_22, %xmm0
   1207         movsd   %xmm0, 152(%esp)
   1208 
Rather than spilling the result of the last addsd in the loop, we should have
inserted a copy to split the interval (one for the duration of the loop, one
   1211 extending to the fall through). The register pressure in the loop isn't high
   1212 enough to warrant the spill.
   1213 
   1214 Also check why xmm7 is not used at all in the function.
   1215 
   1216 //===---------------------------------------------------------------------===//
   1217 
   1218 Take the following:
   1219 
   1220 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
   1221 target triple = "i386-apple-darwin8"
   1222 @in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
   1223 define fastcc void @abort_gzip() noreturn nounwind  {
   1224 entry:
   1225 	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
   1226 	br i1 %tmp.b.i, label %bb.i, label %bb4.i
   1227 bb.i:		; preds = %entry
   1228 	tail call void @exit( i32 1 ) noreturn nounwind 
   1229 	unreachable
   1230 bb4.i:		; preds = %entry
   1231 	store i1 true, i1* @in_exit.4870.b
   1232 	tail call void @exit( i32 1 ) noreturn nounwind 
   1233 	unreachable
   1234 }
   1235 declare void @exit(i32) noreturn nounwind 
   1236 
   1237 This compiles into:
   1238 _abort_gzip:                            ## @abort_gzip
   1239 ## BB#0:                                ## %entry
   1240 	subl	$12, %esp
   1241 	movb	_in_exit.4870.b, %al
   1242 	cmpb	$1, %al
   1243 	jne	LBB0_2
   1244 
   1245 We somehow miss folding the movb into the cmpb.
   1246 
   1247 //===---------------------------------------------------------------------===//
   1248 
   1249 We compile:
   1250 
   1251 int test(int x, int y) {
   1252   return x-y-1;
   1253 }
   1254 
   1255 into (-m64):
   1256 
   1257 _test:
   1258 	decl	%edi
   1259 	movl	%edi, %eax
   1260 	subl	%esi, %eax
   1261 	ret
   1262 
It would be better to codegen this as x+~y (notl+addl).
   1264 
   1265 //===---------------------------------------------------------------------===//
   1266 
   1267 This code:
   1268 
   1269 int foo(const char *str,...)
   1270 {
   1271  __builtin_va_list a; int x;
   1272  __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
   1273  return x;
   1274 }
   1275 
   1276 gets compiled into this on x86-64:
   1277 	subq    $200, %rsp
   1278         movaps  %xmm7, 160(%rsp)
   1279         movaps  %xmm6, 144(%rsp)
   1280         movaps  %xmm5, 128(%rsp)
   1281         movaps  %xmm4, 112(%rsp)
   1282         movaps  %xmm3, 96(%rsp)
   1283         movaps  %xmm2, 80(%rsp)
   1284         movaps  %xmm1, 64(%rsp)
   1285         movaps  %xmm0, 48(%rsp)
   1286         movq    %r9, 40(%rsp)
   1287         movq    %r8, 32(%rsp)
   1288         movq    %rcx, 24(%rsp)
   1289         movq    %rdx, 16(%rsp)
   1290         movq    %rsi, 8(%rsp)
   1291         leaq    (%rsp), %rax
   1292         movq    %rax, 192(%rsp)
   1293         leaq    208(%rsp), %rax
   1294         movq    %rax, 184(%rsp)
   1295         movl    $48, 180(%rsp)
   1296         movl    $8, 176(%rsp)
   1297         movl    176(%rsp), %eax
   1298         cmpl    $47, %eax
   1299         jbe     .LBB1_3 # bb
   1300 .LBB1_1:        # bb3
   1301         movq    184(%rsp), %rcx
   1302         leaq    8(%rcx), %rax
   1303         movq    %rax, 184(%rsp)
   1304 .LBB1_2:        # bb4
   1305         movl    (%rcx), %eax
   1306         addq    $200, %rsp
   1307         ret
   1308 .LBB1_3:        # bb
   1309         movl    %eax, %ecx
   1310         addl    $8, %eax
   1311         addq    192(%rsp), %rcx
   1312         movl    %eax, 176(%rsp)
   1313         jmp     .LBB1_2 # bb4
   1314 
   1315 gcc 4.3 generates:
   1316 	subq    $96, %rsp
   1317 .LCFI0:
   1318         leaq    104(%rsp), %rax
   1319         movq    %rsi, -80(%rsp)
   1320         movl    $8, -120(%rsp)
   1321         movq    %rax, -112(%rsp)
   1322         leaq    -88(%rsp), %rax
   1323         movq    %rax, -104(%rsp)
   1324         movl    $8, %eax
   1325         cmpl    $48, %eax
   1326         jb      .L6
   1327         movq    -112(%rsp), %rdx
   1328         movl    (%rdx), %eax
   1329         addq    $96, %rsp
   1330         ret
   1331         .p2align 4,,10
   1332         .p2align 3
   1333 .L6:
   1334         mov     %eax, %edx
   1335         addq    -104(%rsp), %rdx
   1336         addl    $8, %eax
   1337         movl    %eax, -120(%rsp)
   1338         movl    (%rdx), %eax
   1339         addq    $96, %rsp
   1340         ret
   1341 
   1342 and it gets compiled into this on x86:
   1343 	pushl   %ebp
   1344         movl    %esp, %ebp
   1345         subl    $4, %esp
   1346         leal    12(%ebp), %eax
   1347         movl    %eax, -4(%ebp)
   1348         leal    16(%ebp), %eax
   1349         movl    %eax, -4(%ebp)
   1350         movl    12(%ebp), %eax
   1351         addl    $4, %esp
   1352         popl    %ebp
   1353         ret
   1354 
   1355 gcc 4.3 generates:
   1356 	pushl   %ebp
   1357         movl    %esp, %ebp
   1358         movl    12(%ebp), %eax
   1359         popl    %ebp
   1360         ret
   1361 
   1362 //===---------------------------------------------------------------------===//
   1363 
   1364 Teach tblgen not to check bitconvert source type in some cases. This allows us
   1365 to consolidate the following patterns in X86InstrMMX.td:
   1366 
   1367 def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1368                                                   (iPTR 0))))),
   1369           (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
   1370 def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1371                                                   (iPTR 0))))),
   1372           (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
   1373 def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
   1374                                                   (iPTR 0))))),
   1375           (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
   1376 
   1377 There are other cases in various td files.
   1378 
   1379 //===---------------------------------------------------------------------===//
   1380 
   1381 Take something like the following on x86-32:
   1382 unsigned a(unsigned long long x, unsigned y) {return x % y;}
   1383 
   1384 We currently generate a libcall, but we really shouldn't: the expansion is
   1385 shorter and likely faster than the libcall.  The expected code is something
   1386 like the following:
   1387 
   1388 	movl	12(%ebp), %eax
   1389 	movl	16(%ebp), %ecx
   1390 	xorl	%edx, %edx
   1391 	divl	%ecx
   1392 	movl	8(%ebp), %eax
   1393 	divl	%ecx
   1394 	movl	%edx, %eax
   1395 	ret
   1396 
   1397 A similar code sequence works for division.
   1398 
   1399 //===---------------------------------------------------------------------===//
   1400 
These should compile to the same code, but the latter codegens to useless
instructions on X86. This may be a trivial dag combine (GCC PR7061):
   1403 
   1404 struct s1 { unsigned char a, b; };
   1405 unsigned long f1(struct s1 x) {
   1406     return x.a + x.b;
   1407 }
   1408 struct s2 { unsigned a: 8, b: 8; };
   1409 unsigned long f2(struct s2 x) {
   1410     return x.a + x.b;
   1411 }
   1412 
   1413 //===---------------------------------------------------------------------===//
   1414 
   1415 We currently compile this:
   1416 
   1417 define i32 @func1(i32 %v1, i32 %v2) nounwind {
   1418 entry:
   1419   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   1420   %sum = extractvalue {i32, i1} %t, 0
   1421   %obit = extractvalue {i32, i1} %t, 1
   1422   br i1 %obit, label %overflow, label %normal
   1423 normal:
   1424   ret i32 %sum
   1425 overflow:
   1426   call void @llvm.trap()
   1427   unreachable
   1428 }
   1429 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
   1430 declare void @llvm.trap()
   1431 
   1432 to:
   1433 
   1434 _func1:
   1435 	movl	4(%esp), %eax
   1436 	addl	8(%esp), %eax
   1437 	jo	LBB1_2	## overflow
   1438 LBB1_1:	## normal
   1439 	ret
   1440 LBB1_2:	## overflow
   1441 	ud2
   1442 
It would be nice to produce "into" someday.
   1444 
   1445 //===---------------------------------------------------------------------===//
   1446 
   1447 This code:
   1448 
   1449 void vec_mpys1(int y[], const int x[], int scaler) {
   1450 int i;
   1451 for (i = 0; i < 150; i++)
   1452  y[i] += (((long long)scaler * (long long)x[i]) >> 31);
   1453 }
   1454 
   1455 Compiles to this loop with GCC 3.x:
   1456 
   1457 .L5:
   1458 	movl	%ebx, %eax
   1459 	imull	(%edi,%ecx,4)
   1460 	shrdl	$31, %edx, %eax
   1461 	addl	%eax, (%esi,%ecx,4)
   1462 	incl	%ecx
   1463 	cmpl	$149, %ecx
   1464 	jle	.L5
   1465 
   1466 llvm-gcc compiles it to the much uglier:
   1467 
   1468 LBB1_1:	## bb1
   1469 	movl	24(%esp), %eax
   1470 	movl	(%eax,%edi,4), %ebx
   1471 	movl	%ebx, %ebp
   1472 	imull	%esi, %ebp
   1473 	movl	%ebx, %eax
   1474 	mull	%ecx
   1475 	addl	%ebp, %edx
   1476 	sarl	$31, %ebx
   1477 	imull	%ecx, %ebx
   1478 	addl	%edx, %ebx
   1479 	shldl	$1, %eax, %ebx
   1480 	movl	20(%esp), %eax
   1481 	addl	%ebx, (%eax,%edi,4)
   1482 	incl	%edi
   1483 	cmpl	$150, %edi
   1484 	jne	LBB1_1	## bb1
   1485 
The issue is that we hoist the cast of "scaler" to long long outside of the
loop; the value comes into the loop as two values, and
   1488 RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
   1489 constructed BUILD_PAIR which represents the cast value.
   1490 
   1491 This can be handled by making CodeGenPrepare sink the cast.
   1492 
   1493 //===---------------------------------------------------------------------===//
   1494 
   1495 Test instructions can be eliminated by using EFLAGS values from arithmetic
   1496 instructions. This is currently not done for mul, and, or, xor, neg, shl,
   1497 sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
for read-modify-write instructions. It is also currently not done if the
   1499 OF or CF flags are needed.
   1500 
   1501 The shift operators have the complication that when the shift count is
   1502 zero, EFLAGS is not set, so they can only subsume a test instruction if
   1503 the shift count is known to be non-zero. Also, using the EFLAGS value
   1504 from a shift is apparently very slow on some x86 implementations.
   1505 
   1506 In read-modify-write instructions, the root node in the isel match is
   1507 the store, and isel has no way for the use of the EFLAGS result of the
   1508 arithmetic to be remapped to the new node.
   1509 
Add and subtract instructions set OF on signed overflow and CF on unsigned
   1511 overflow, while test instructions always clear OF and CF. In order to
   1512 replace a test with an add or subtract in a situation where OF or CF is
   1513 needed, codegen must be able to prove that the operation cannot see
   1514 signed or unsigned overflow, respectively.
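
For illustration, a sketch (hypothetical function) where the flags set by the
'and' could subsume the explicit test:

int and_then_branch(int x, int y) {
  int t = x & y;           /* the and already sets ZF */
  return t == 0 ? 1 : 2;   /* a separate test of t is redundant if ZF is reused */
}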
   1515 
   1516 //===---------------------------------------------------------------------===//
   1517 
   1518 memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
   1519 define <16 x float> @foo(<16 x float> %A) nounwind {
   1520 	%tmp = alloca <16 x float>, align 16
   1521 	%tmp2 = alloca <16 x float>, align 16
   1522 	store <16 x float> %A, <16 x float>* %tmp
   1523 	%s = bitcast <16 x float>* %tmp to i8*
   1524 	%s2 = bitcast <16 x float>* %tmp2 to i8*
   1525 	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
   1526 	%R = load <16 x float>* %tmp2
   1527 	ret <16 x float> %R
   1528 }
   1529 
   1530 declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
   1531 
   1532 which compiles to:
   1533 
   1534 _foo:
   1535 	subl	$140, %esp
   1536 	movaps	%xmm3, 112(%esp)
   1537 	movaps	%xmm2, 96(%esp)
   1538 	movaps	%xmm1, 80(%esp)
   1539 	movaps	%xmm0, 64(%esp)
   1540 	movl	60(%esp), %eax
   1541 	movl	%eax, 124(%esp)
   1542 	movl	56(%esp), %eax
   1543 	movl	%eax, 120(%esp)
   1544 	movl	52(%esp), %eax
   1545         <many many more 32-bit copies>
   1546       	movaps	(%esp), %xmm0
   1547 	movaps	16(%esp), %xmm1
   1548 	movaps	32(%esp), %xmm2
   1549 	movaps	48(%esp), %xmm3
   1550 	addl	$140, %esp
   1551 	ret
   1552 
   1553 On Nehalem, it may even be cheaper to just use movups when unaligned than to
   1554 fall back to lower-granularity chunks.
   1555 
   1556 //===---------------------------------------------------------------------===//
   1557 
   1558 Implement processor-specific optimizations for parity with GCC on these
   1559 processors.  GCC does two optimizations:
   1560 
   1561 1. ix86_pad_returns inserts a noop before ret instructions if immediately
   1562    preceded by a conditional branch or is the target of a jump.
   1563 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
   1564    code contains more than 3 branches.
   1565    
The first one is done for all AMDs, Core2, and "Generic".
The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
  Core 2, and "Generic".
   1569 
   1570 //===---------------------------------------------------------------------===//
   1571 
   1572 Testcase:
   1573 int a(int x) { return (x & 127) > 31; }
   1574 
   1575 Current output:
   1576 	movl	4(%esp), %eax
   1577 	andl	$127, %eax
   1578 	cmpl	$31, %eax
   1579 	seta	%al
   1580 	movzbl	%al, %eax
   1581 	ret
   1582 
   1583 Ideal output:
   1584 	xorl	%eax, %eax
   1585 	testl	$96, 4(%esp)
   1586 	setne	%al
   1587 	ret
   1588 
   1589 This should definitely be done in instcombine, canonicalizing the range
   1590 condition into a != condition.  We get this IR:
   1591 
   1592 define i32 @a(i32 %x) nounwind readnone {
   1593 entry:
   1594 	%0 = and i32 %x, 127		; <i32> [#uses=1]
   1595 	%1 = icmp ugt i32 %0, 31		; <i1> [#uses=1]
   1596 	%2 = zext i1 %1 to i32		; <i32> [#uses=1]
   1597 	ret i32 %2
   1598 }
   1599 
Instcombine prefers to strength-reduce relational comparisons to equality
comparisons when possible; this should be another case of that.  This could
be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
be redesigned to use ComputeMaskedBits and friends.
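
For reference, the equality-canonicalized form of this testcase (derived from
the ideal output above) would be:

int a_canon(int x) { return (x & 96) != 0; }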
   1605 
   1606 
   1607 //===---------------------------------------------------------------------===//
   1608 Testcase:
   1609 int x(int a) { return (a&0xf0)>>4; }
   1610 
   1611 Current output:
   1612 	movl	4(%esp), %eax
   1613 	shrl	$4, %eax
   1614 	andl	$15, %eax
   1615 	ret
   1616 
   1617 Ideal output:
   1618 	movzbl	4(%esp), %eax
   1619 	shrl	$4, %eax
   1620 	ret
   1621 
   1622 //===---------------------------------------------------------------------===//
   1623 
   1624 Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
   1625 properly.
   1626 
When the return value is not used (i.e. we only care about the value in
memory), x86 does not have to produce the result in a register to implement
these. Instead, it can use the add, sub, inc, and dec instructions with the
"lock" prefix.
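
For example (an illustrative case with a hypothetical name, not code from the
original report), the builtin's result is discarded here, so a single locked
RMW instruction is all that is needed:

void counter_bump(long *p) {
  /* Result unused: ideally a single "lock addq $1, (%rdi)" (or "lock incq"),
     rather than a lock xadd that materializes the old value. */
  __sync_add_and_fetch(p, 1);
}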
   1630 
This is currently implemented using a bit of an instruction selection trick.
The issue is that the target-independent pattern produces one output and a
chain, and we want to map it into one that just outputs a chain. The current
trick is to select it into a MERGE_VALUES with the first definition being an
implicit_def. The proper solution is to add new ISD opcodes for the no-output
variant; the DAG combiner can then transform the node before it gets to target
node selection.

Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
in fact these instructions are identical to the non-lock versions. We need a
way to add target-specific information to target nodes and have this
information carried over to machine instructions. The asm printer (or JIT) can
then use this information to add the "lock" prefix.
   1643 
   1644 //===---------------------------------------------------------------------===//
   1645 
   1646 struct B {
   1647   unsigned char y0 : 1;
   1648 };
   1649 
   1650 int bar(struct B* a) { return a->y0; }
   1651 
   1652 define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
   1653   %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
   1654   %2 = load i8* %1, align 1
   1655   %3 = and i8 %2, 1
   1656   %4 = zext i8 %3 to i32
   1657   ret i32 %4
   1658 }
   1659 
   1660 bar:                                    # @bar
   1661 # BB#0:
   1662         movb    (%rdi), %al
   1663         andb    $1, %al
   1664         movzbl  %al, %eax
   1665         ret
   1666 
   1667 Missed optimization: should be movl+andl.
   1668 
   1669 //===---------------------------------------------------------------------===//
   1670 
The x86_64 ABI says:
   1672 
   1673 Booleans, when stored in a memory object, are stored as single byte objects the
   1674 value of which is always 0 (false) or 1 (true).
   1675 
   1676 We are not using this fact:
   1677 
   1678 int bar(_Bool *a) { return *a; }
   1679 
   1680 define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
   1681   %1 = load i8* %a, align 1, !tbaa !0
   1682   %tmp = and i8 %1, 1
   1683   %2 = zext i8 %tmp to i32
   1684   ret i32 %2
   1685 }
   1686 
   1687 bar:
   1688         movb    (%rdi), %al
   1689         andb    $1, %al
   1690         movzbl  %al, %eax
   1691         ret
   1692 
   1693 GCC produces
   1694 
   1695 bar:
   1696         movzbl  (%rdi), %eax
   1697         ret
   1698 
   1699 //===---------------------------------------------------------------------===//
   1700 
   1701 Consider the following two functions compiled with clang:
   1702 _Bool foo(int *x) { return !(*x & 4); }
   1703 unsigned bar(int *x) { return !(*x & 4); }
   1704 
   1705 foo:
   1706 	movl	4(%esp), %eax
   1707 	testb	$4, (%eax)
   1708 	sete	%al
   1709 	movzbl	%al, %eax
   1710 	ret
   1711 
   1712 bar:
   1713 	movl	4(%esp), %eax
   1714 	movl	(%eax), %eax
   1715 	shrl	$2, %eax
   1716 	andl	$1, %eax
   1717 	xorl	$1, %eax
   1718 	ret
   1719 
The second function generates more code even though the two functions are
functionally identical.
   1722 
   1723 //===---------------------------------------------------------------------===//
   1724 
   1725 Take the following C code:
   1726 int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
   1727 
   1728 We generate the following IR with clang:
   1729 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1730 entry:
   1731   %tmp = xor i32 %b, %a                           ; <i32> [#uses=1]
   1732   %tmp6 = and i32 %tmp, 255                       ; <i32> [#uses=1]
   1733   %cmp = icmp eq i32 %tmp6, 0                     ; <i1> [#uses=1]
   1734   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1735   ret i32 %conv5
   1736 }
   1737 
   1738 And the following x86 code:
   1739 	xorl	%esi, %edi
   1740 	testb	$-1, %dil
   1741 	sete	%al
   1742 	movzbl	%al, %eax
   1743 	ret
   1744 
   1745 A cmpb instead of the xorl+testb would be one instruction shorter.
   1746 
   1747 //===---------------------------------------------------------------------===//
   1748 
   1749 Given the following C code:
   1750 int f(int a, int b) { return (signed char)a == (signed char)b; }
   1751 
   1752 We generate the following IR with clang:
   1753 define i32 @f(i32 %a, i32 %b) nounwind readnone {
   1754 entry:
   1755   %sext = shl i32 %a, 24                          ; <i32> [#uses=1]
   1756   %conv1 = ashr i32 %sext, 24                     ; <i32> [#uses=1]
   1757   %sext6 = shl i32 %b, 24                         ; <i32> [#uses=1]
   1758   %conv4 = ashr i32 %sext6, 24                    ; <i32> [#uses=1]
   1759   %cmp = icmp eq i32 %conv1, %conv4               ; <i1> [#uses=1]
   1760   %conv5 = zext i1 %cmp to i32                    ; <i32> [#uses=1]
   1761   ret i32 %conv5
   1762 }
   1763 
   1764 And the following x86 code:
   1765 	movsbl	%sil, %eax
   1766 	movsbl	%dil, %ecx
   1767 	cmpl	%eax, %ecx
   1768 	sete	%al
   1769 	movzbl	%al, %eax
   1770 	ret
   1771 
It should be possible to eliminate the sign extensions: equality of the
sign-extended values depends only on the low bytes, so a single cmpb (as in
the previous entry) would suffice.
   1774 
   1775 //===---------------------------------------------------------------------===//
   1776 
   1777 LLVM misses a load+store narrowing opportunity in this code:
   1778 
   1779 %struct.bf = type { i64, i16, i16, i32 }
   1780 
   1781 @bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
   1782 
   1783 define void @t1() nounwind ssp {
   1784 entry:
   1785   %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1786   %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
   1787   %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
   1788   %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
   1789   %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
   1790   store i32 %4, i32* %2, align 1
   1791   %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
   1792   %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
   1793   %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
   1794   %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
   1795   %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
   1796   store i32 %9, i32* %7, align 1
   1797   ret void
   1798 }
   1799 
   1800 LLVM currently emits this:
   1801 
   1802   movq  bfi(%rip), %rax
   1803   andl  $-65537, 8(%rax)
   1804   movq  bfi(%rip), %rax
   1805   andl  $-131073, 8(%rax)
   1806   ret
   1807 
   1808 It could narrow the loads and stores to emit this:
   1809 
   1810   movq  bfi(%rip), %rax
   1811   andb  $-2, 10(%rax)
   1812   movq  bfi(%rip), %rax
   1813   andb  $-3, 10(%rax)
   1814   ret
   1815 
   1816 The trouble is that there is a TokenFactor between the store and the
   1817 load, making it non-trivial to determine if there's anything between
   1818 the load and the store which would prohibit narrowing.
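
For reference, the narrowing is legal because each 32-bit mask clears a single
bit that lies entirely within one byte.  A small standalone check of that
arithmetic (not part of the testcase):

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* -65537 == ~(1 << 16): it clears only bit 16, which is bit 0 of the byte
     at offset 8 + 2 = 10, hence "andb $-2, 10(%rax)". */
  assert((uint32_t)-65537 == ~(1u << 16));
  /* -131073 == ~(1 << 17): it clears only bit 17, bit 1 of the same byte,
     hence "andb $-3, 10(%rax)". */
  assert((uint32_t)-131073 == ~(1u << 17));
  return 0;
}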
   1819 
   1820 //===---------------------------------------------------------------------===//
   1821 
   1822 This code:
   1823 void foo(unsigned x) {
   1824   if (x == 0) bar();
   1825   else if (x == 1) qux();
   1826 }
   1827 
   1828 currently compiles into:
   1829 _foo:
   1830 	movl	4(%esp), %eax
   1831 	cmpl	$1, %eax
   1832 	je	LBB0_3
   1833 	testl	%eax, %eax
   1834 	jne	LBB0_4
   1835 
   1836 the testl could be removed:
   1837 _foo:
   1838 	movl	4(%esp), %eax
   1839 	cmpl	$1, %eax
   1840 	je	LBB0_3
   1841 	jb	LBB0_4
   1842 
   1843 0 is the only unsigned number < 1.
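
Stated at the source level (an illustrative restatement with hypothetical
names, not a suggested rewrite): once the cmpl has ruled out x == 1, the
remaining x == 0 test is exactly the unsigned x < 1 test, and the flags from
that same cmpl already encode "below 1".

void bar(void);
void qux(void);

void foo_equiv(unsigned x) {
  if (x == 1)
    qux();
  else if (x < 1)     /* for unsigned x, x < 1  <=>  x == 0 */
    bar();
}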
   1844 
   1845 //===---------------------------------------------------------------------===//
   1846 
   1847 This code:
   1848 
   1849 %0 = type { i32, i1 }
   1850 
   1851 define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
   1852 entry:
   1853   %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
   1854   %cmp = extractvalue %0 %uadd, 1
   1855   %inc = zext i1 %cmp to i32
   1856   %add = add i32 %x, %sum
   1857   %z.0 = add i32 %add, %inc
   1858   ret i32 %z.0
   1859 }
   1860 
   1861 declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
   1862 
   1863 compiles to:
   1864 
   1865 _add32carry:                            ## @add32carry
   1866 	addl	%esi, %edi
   1867 	sbbl	%ecx, %ecx
   1868 	movl	%edi, %eax
   1869 	subl	%ecx, %eax
   1870 	ret
   1871 
   1872 But it could be:
   1873 
   1874 _add32carry:
   1875 	leal	(%rsi,%rdi), %eax
   1876 	cmpl	%esi, %eax
   1877 	adcl	$0, %eax
   1878 	ret
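
A C rendering of what the shorter sequence computes (illustrative only; the
original C source for this IR is not shown here):

#include <stdint.h>

uint32_t add32carry_ref(uint32_t sum, uint32_t x) {
  uint32_t r = sum + x;     /* leal (%rsi,%rdi), %eax                      */
  uint32_t carry = r < x;   /* cmpl %esi, %eax sets CF iff the add wrapped */
  return r + carry;         /* adcl $0, %eax folds CF back in              */
}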
   1879 
   1880 //===---------------------------------------------------------------------===//
   1881 
   1882 The hot loop of 256.bzip2 contains code that looks a bit like this:
   1883 
   1884 int foo(char *P, char *Q, int x, int y) {
   1885   if (P[0] != Q[0])
   1886      return P[0] < Q[0];
   1887   if (P[1] != Q[1])
   1888      return P[1] < Q[1];
   1889   if (P[2] != Q[2])
   1890      return P[2] < Q[2];
   1891    return P[3] < Q[3];
   1892 }
   1893 
   1894 In the real code, we get a lot more wrong than this.  However, even in this
   1895 code we generate:
   1896 
   1897 _foo:                                   ## @foo
   1898 ## BB#0:                                ## %entry
   1899 	movb	(%rsi), %al
   1900 	movb	(%rdi), %cl
   1901 	cmpb	%al, %cl
   1902 	je	LBB0_2
   1903 LBB0_1:                                 ## %if.then
   1904 	cmpb	%al, %cl
   1905 	jmp	LBB0_5
   1906 LBB0_2:                                 ## %if.end
   1907 	movb	1(%rsi), %al
   1908 	movb	1(%rdi), %cl
   1909 	cmpb	%al, %cl
   1910 	jne	LBB0_1
   1911 ## BB#3:                                ## %if.end38
   1912 	movb	2(%rsi), %al
   1913 	movb	2(%rdi), %cl
   1914 	cmpb	%al, %cl
   1915 	jne	LBB0_1
   1916 ## BB#4:                                ## %if.end60
   1917 	movb	3(%rdi), %al
   1918 	cmpb	3(%rsi), %al
   1919 LBB0_5:                                 ## %if.end60
   1920 	setl	%al
   1921 	movzbl	%al, %eax
   1922 	ret
   1923 
   1924 Note that we generate jumps to LBB0_1 which does a redundant compare.  The
   1925 redundant compare also forces the register values to be live, which prevents
   1926 folding one of the loads into the compare.  In contrast, GCC 4.2 produces:
   1927 
   1928 _foo:
   1929 	movzbl	(%rsi), %eax
   1930 	cmpb	%al, (%rdi)
   1931 	jne	L10
   1932 L12:
   1933 	movzbl	1(%rsi), %eax
   1934 	cmpb	%al, 1(%rdi)
   1935 	jne	L10
   1936 	movzbl	2(%rsi), %eax
   1937 	cmpb	%al, 2(%rdi)
   1938 	jne	L10
   1939 	movzbl	3(%rdi), %eax
   1940 	cmpb	3(%rsi), %al
   1941 L10:
   1942 	setl	%al
   1943 	movzbl	%al, %eax
   1944 	ret
   1945 
   1946 which is "perfect".
   1947 
   1948 //===---------------------------------------------------------------------===//
   1949 
   1950 For the branch in the following code:
   1951 int a();
   1952 int b(int x, int y) {
   1953   if (x & (1<<(y&7)))
   1954     return a();
   1955   return y;
   1956 }
   1957 
   1958 We currently generate:
   1959 	movb	%sil, %al
   1960 	andb	$7, %al
   1961 	movzbl	%al, %eax
   1962 	btl	%eax, %edi
   1963 	jae	.LBB0_2
   1964 
   1965 movl+andl would be shorter than the movb+andb+movzbl sequence.
   1966 
   1967 //===---------------------------------------------------------------------===//
   1968 
   1969 For the following:
   1970 struct u1 {
   1971     float x, y;
   1972 };
   1973 float foo(struct u1 u) {
   1974     return u.x + u.y;
   1975 }
   1976 
   1977 We currently generate:
   1978 	movdqa	%xmm0, %xmm1
   1979 	pshufd	$1, %xmm0, %xmm0        # xmm0 = xmm0[1,0,0,0]
   1980 	addss	%xmm1, %xmm0
   1981 	ret
   1982 
   1983 We could save an instruction here by commuting the addss.
   1984 
   1985 //===---------------------------------------------------------------------===//
   1986 
   1987 This (from PR9661):
   1988 
   1989 float clamp_float(float a) {
   1990         if (a > 1.0f)
   1991                 return 1.0f;
   1992         else if (a < 0.0f)
   1993                 return 0.0f;
   1994         else
   1995                 return a;
   1996 }
   1997 
   1998 Could compile to:
   1999 
   2000 clamp_float:                            # @clamp_float
   2001         movss   .LCPI0_0(%rip), %xmm1
   2002         minss   %xmm1, %xmm0
   2003         pxor    %xmm1, %xmm1
   2004         maxss   %xmm1, %xmm0
   2005         ret
   2006 
   2007 with -ffast-math.
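
The min/max view of the clamp, for reference (an illustrative restatement, not
the original source).  It is only interchangeable with the branchy version
under -ffast-math because the two disagree on NaN (and signed-zero) inputs:

#include <math.h>

float clamp_float_ref(float a) {
  return fmaxf(0.0f, fminf(a, 1.0f));   /* conceptually the minss/maxss pair */
}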
   2008 
   2009 //===---------------------------------------------------------------------===//
   2010 
   2011 This function (from PR9803):
   2012 
   2013 int clamp2(int a) {
   2014         if (a > 5)
   2015                 a = 5;
   2016         if (a < 0) 
   2017                 return 0;
   2018         return a;
   2019 }
   2020 
   2021 Compiles to:
   2022 
   2023 _clamp2:                                ## @clamp2
   2024         pushq   %rbp
   2025         movq    %rsp, %rbp
   2026         cmpl    $5, %edi
   2027         movl    $5, %ecx
   2028         cmovlel %edi, %ecx
   2029         testl   %ecx, %ecx
   2030         movl    $0, %eax
   2031         cmovnsl %ecx, %eax
   2032         popq    %rbp
   2033         ret
   2034 
The move of 0 could be scheduled above the test so that it can become an
xor reg,reg (the xor clobbers EFLAGS, so it has to go before the test).
   2036 
   2037 //===---------------------------------------------------------------------===//
   2038 
   2039 GCC PR48986.  We currently compile this:
   2040 
   2041 void bar(void);
   2042 void yyy(int* p) {
   2043     if (__sync_fetch_and_add(p, -1) == 1)
   2044       bar();
   2045 }
   2046 
   2047 into:
   2048 	movl	$-1, %eax
   2049 	lock
   2050 	xaddl	%eax, (%rdi)
   2051 	cmpl	$1, %eax
   2052 	je	LBB0_2
   2053 
   2054 Instead we could generate:
   2055 
   2056 	lock
	decl	(%rdi)
   2058 	je LBB0_2
   2059 
   2060 The trick is to match "fetch_and_add(X, -C) == C".
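
A slightly more general instance of the same pattern (illustrative, with a
hypothetical function name; not part of the PR): any constant works, because
the question is simply whether the locked subtraction left zero behind, and ZF
already answers that.

void bar(void);

void yyy4(int *p) {
  if (__sync_fetch_and_add(p, -4) == 4)   /* could be: lock subl $4, (%rdi); je */
    bar();
}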
   2061 
   2062 //===---------------------------------------------------------------------===//
   2063 
   2064 unsigned log2(unsigned x) {
   2065   return x > 1 ? 32-__builtin_clz(x-1) : 0;
   2066 }
   2067 
   2068 generates (x86_64):
   2069 	xorl	%eax, %eax
   2070 	cmpl	$2, %edi
   2071 	jb	LBB0_2
   2072 ## BB#1:
   2073 	decl	%edi
   2074 	movl	$63, %ecx
   2075 	bsrl	%edi, %eax
   2076 	cmovel	%ecx, %eax
   2077 	xorl	$-32, %eax
   2078 	addl	$33, %eax
   2079 LBB0_2:
   2080 	ret
   2081 
Given the early test, the cmov (and the movl of 63 that feeds it) is redundant;
x > 1 guarantees that x-1 is nonzero, so bsr never sees a zero input.  This
would do:
   2083 	xorl	%eax, %eax
   2084 	cmpl	$2, %edi
   2085 	jb	LBB0_2
   2086 ## BB#1:
   2087 	decl	%edi
   2088 	bsrl	%edi, %eax
   2089 	xorl	$-32, %eax
   2090 	addl	$33, %eax
   2091 LBB0_2:
   2092 	ret
   2093 
   2094 //===---------------------------------------------------------------------===//
   2095