Directory listing of external/llvm/lib/Target/PowerPC (all entries dated 28-Mar-2012):

Name                        Size
CMakeLists.txt              1.1K
InstPrinter/                -
Makefile                    785
MCTargetDesc/               -
PPC.h                       2.5K
PPC.td                      5.4K
PPCAsmPrinter.cpp           25.4K
PPCBranchSelector.cpp       5.8K
PPCCallingConv.td           5.2K
PPCCodeEmitter.cpp          9.7K
PPCFrameLowering.cpp        34.6K
PPCFrameLowering.h          8.9K
PPCHazardRecognizers.cpp    10.2K
PPCHazardRecognizers.h      3.1K
PPCInstr64Bit.td            32.5K
PPCInstrAltivec.td          33.9K
PPCInstrBuilder.h           1.4K
PPCInstrFormats.td          23.3K
PPCInstrInfo.cpp            25.4K
PPCInstrInfo.h              6.1K
PPCInstrInfo.td             65.2K
PPCISelDAGToDAG.cpp         42.1K
PPCISelLowering.cpp         233.5K
PPCISelLowering.h           22K
PPCJITInfo.cpp              17.6K
PPCJITInfo.h                1.6K
PPCMachineFunctionInfo.h    5K
PPCMCInstLower.cpp          6.1K
PPCPerfectShuffle.h         397.4K
PPCRegisterInfo.cpp         23.1K
PPCRegisterInfo.h           2.3K
PPCRegisterInfo.td          13K
PPCRelocations.h            1.9K
PPCSchedule.td              14.3K
PPCSchedule440.td           30.2K
PPCScheduleG3.td            3.1K
PPCScheduleG4.td            3.8K
PPCScheduleG4Plus.td        4.1K
PPCScheduleG5.td            4.4K
PPCSelectionDAGInfo.cpp     737
PPCSelectionDAGInfo.h       830
PPCSubtarget.cpp            4.6K
PPCSubtarget.h              5K
PPCTargetMachine.cpp        3.5K
PPCTargetMachine.h          3.2K
README.txt                  24.1K
README_ALTIVEC.txt          6.2K
TargetInfo/                 -

README.txt

//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform (see the sketch below)
* lmw/stmw pass a la arm load store optimizer for prolog/epilog

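As a concrete illustration of the do-loop item above (hypothetical code, not
from a testcase): a loop whose trip count is known before entry should be
emitted as mtctr/bdnz instead of keeping an induction variable in a GPR and
comparing every iteration.

void scale(float *a, int n) {
  for (int i = 0; i != n; ++i)    /* trip count n -> mtctr n; ... bdnz */
    a[i] *= 2.0f;
}
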
===-------------------------------------------------------------------------===

On PPC64, this:

long f2 (long x) { return 0xfffffff000000000UL; }
long f3 (long x) { return 0x1ffffffffUL; }

could compile into:

_f2:
	li r3,-1
	rldicr r3,r3,0,27
	blr
_f3:
	li r3,-1
	rldicl r3,r3,0,31
	blr

we produce:

_f2:
	lis r2, 4095
	ori r2, r2, 65535
	sldi r3, r2, 36
	blr
_f3:
	li r2, 1
	sldi r2, r2, 32
	oris r2, r2, 65535
	ori r3, r2, 65535
	blr

===-------------------------------------------------------------------------===

This code:

unsigned add32carry(unsigned sum, unsigned x) {
  unsigned z = sum + x;
  if (sum + x < x)
    z++;
  return z;
}

Should compile to something like:

	addc r3,r3,r4
	addze r3,r3

instead we get:

	add r3, r4, r3
	cmplw cr7, r3, r4
	mfcr r4 ; 1
	rlwinm r4, r4, 29, 31, 31
	add r3, r3, r4

Ick.

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
        %Y = getelementptr long* %X, int 4
        %A = load long* %Y
        store long %A, long* %dest
        ret long* %Y
}

compiles to:

_test4:
        mr r2, r3
        lwzu r5, 32(r2)
        lwz r3, 36(r3)
        stw r5, 0(r4)
        stw r3, 4(r4)
        mr r3, r2
        blr

with -sched=list-burr, I get:

_test4:
        lwz r2, 36(r3)
        lwzu r5, 32(r3)
        stw r2, 4(r4)
        stw r5, 0(r4)
        blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

        li r6, 0
        b LBB1_84       ;bb432.i
LBB1_83:        ;bb420.i
        lbzx r8, r5, r7
        addi r6, r7, 1
        stbx r8, r4, r7
LBB1_84:        ;bb432.i
        mr r7, r6
        cmplwi cr0, r7, 143
        bne cr0, LBB1_83        ;bb420.i

The CBE manages to produce:

	li r0, 143
	mtctr r0
loop:
	lbzx r2, r2, r11
	stbx r0, r2, r9
	addi r2, r2, 1
	bdz later
	b loop

This could be much better (bdnz instead of bdz) but it still beats us.  If we
produced this with bdnz, the loop would be a single dispatch group.

===-------------------------------------------------------------------------===

Compile:

void foo(int *P) {
  if (P)  *P = 0;
}

into:

_foo:
        cmpwi cr0,r3,0
        beqlr cr0
        li r0,0
        stw r0,0(r3)
        blr

This is effectively a simple form of predication.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

Here's another example (the sgn function):

double testf(double a) {
       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

it produces a BB like this:

LBB1_1: ; cond_true
        lis r2, ha16(LCPI1_0)
        lfs f0, lo16(LCPI1_0)(r2)
        lis r2, ha16(LCPI1_1)
        lis r3, ha16(LCPI1_2)
        lfs f2, lo16(LCPI1_2)(r3)
        lfs f3, lo16(LCPI1_1)(r2)
        fsub f0, f0, f1
        fsel f1, f0, f2, f3
        blr

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

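A minimal sketch of the transformation (hypothetical code, not from a
testcase):

/* Before: each global needs its own PIC address computation. */
static int a, b, c;

/* After (illustrative): one base address, computed once and CSE'd; the
   members are reached as constant offsets from it. */
static struct { int a, b, c; } merged;
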
===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when the
divisor has more than one use.  Itanium would want this too.

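For reference, one Newton-Raphson refinement step for a reciprocal estimate
(e.g. the output of fres), sketched in C; each step roughly doubles the number
of correct bits:

float refine_recip(float d, float x0) {
  /* x1 = x0 * (2 - d * x0), where x0 is a rough estimate of 1/d */
  return x0 * (2.0f - d * x0);
}
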
===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub removal:

We still generate calls to foo$stub, and stubs, on Darwin.  This is not
necessary when building with the Leopard (10.5) or later linker, as stubs are
generated by ld when necessary.  Parameterizing this based on the deployment
target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
its logic.

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

     fp = &bar;
     for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10. That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5. The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

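To make the register assignment concrete (an illustration of the rule above,
not a testcase; foo2 is a made-up name):

void foo(int X, double Y, int Z);   /* X -> r3, Y -> f1, Z -> r6;
                                       Y shadows the bytes for r4/r5 */
void foo2(int X, int Z, double Y);  /* X -> r3, Z -> r4, Y -> f1 */
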
===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses, so that
this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

compiles from:

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

to:

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

void f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three-address
form: ops that read/write the same register are half as expensive as those
that read from two registers that are different from their destination.

We should model this with two separate instructions.  The isel should generate
the "two address" form of the instructions.  When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).

===-------------------------------------------------------------------------===

PR5945: This:

define i32 @clamp0g(i32 %a) {
entry:
        %cmp = icmp slt i32 %a, 0
        %sel = select i1 %cmp, i32 0, i32 %a
        ret i32 %sel
}

is compiled to this with the PowerPC (32-bit) backend:

_clamp0g:
        cmpwi cr0, r3, 0
        li r2, 0
        blt cr0, LBB1_2
; BB#1:                                                     ; %entry
        mr r2, r3
LBB1_2:                                                     ; %entry
        mr r3, r2
        blr

This could be reduced to the much simpler:

_clamp0g:
        srawi r2, r3, 31
        andc r3, r3, r2
        blr

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf

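For instance, a sketch of the strength reduction (illustrative only, assuming
X > 0): carry the quotient and remainder across iterations instead of
recomputing t / X and t % X every trip:

int foo2(int N, int ***W, int **TK, int X) {
  int quot = 0, rem = 0;                 /* quot == t / X, rem == t % X */
  for (int t = 0; t < N; ++t) {
    for (int i = 0; i < 4; ++i)
      W[quot][i][rem] = TK[i][t];
    if (++rem == X) { rem = 0; ++quot; } /* advance both incrementally */
  }
  return 5;
}
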
===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

Complete the signed i32 to FP conversion code using 64-bit registers
transformation, good for PI.  See PPCISelLowering.cpp, this comment:

     // FIXME: disable this lowered code.  This generates 64-bit register values,
     // and we don't model the fact that the top part is clobbered by calls.  We
     // need to flag these together so that the value isn't live across a call.
     //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved/restored, otherwise we will miscompile the code.  It
sounds like we need to get the 64-bit register classes going.

===-------------------------------------------------------------------------===

%struct.B = type { i8, [3 x i8] }

define void @bar(%struct.B* %b) {
entry:
        %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
        %tmp = load i32* %tmp          ; <uint> [#uses=1]
        %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
        %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
        %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
        %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
        %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
        %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
        %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
        %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
        %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
        %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
        store i32 %tmp13, i32* %tmp8
        ret void
}

We emit:

_bar:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_bar:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
        lis r2, 255
        rlwinm r3, r3, 16, 0, 31
        ori r2, r2, 255
        and r3, r3, r2
        blr

GCC gets it down to:

_test6:
        rlwinm r0,r3,16,8,15
        rlwinm r3,r3,16,24,31
        or r3,r3,r0
        blr

===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }

The FP constant ends up in the constant pool, so we need to get the LR
register.  This ends up producing code like this:

_foo:
.LBB_foo_0:     ; entry
        mflr r11
***     stw r11, 8(r1)
        bl "L00000$pb"
"L00000$pb":
        mflr r2
        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
        fadds f1, f1, f0
***     lwz r11, 8(r1)
        mtlr r11
        blr

This is functional, but there is no reason to spill the LR register all the way
to the stack (the two marked instrs): spilling it to a GPR is quite enough.

Implementing this will require some codegen improvements.  Nate writes:

"So basically what we need to support the "no stack frame save and restore" is a
generalization of the LR optimization to "callee-save regs".

Currently, we have LR marked as a callee-save reg.  The register allocator sees
that it's callee save, and spills it directly to the stack.

Ideally, something like this would happen:

LR would be in a separate register class from the GPRs. The class of LR would be
marked "unspillable".  When the register allocator came across an unspillable
reg, it would ask "what is the best class to copy this into that I *can* spill"
If it gets a class back, which it will in this case (the gprs), it grabs a free
register of that class.  If it is then later necessary to spill that reg, so be
it."

===-------------------------------------------------------------------------===

We compile this:

int test(_Bool X) {
  return X ? 524288 : 0;
}

to:

_test:
        cmplwi cr0, r3, 0
        lis r2, 8
        li r3, 0
        beq cr0, LBB1_2 ;entry
LBB1_1: ;entry
        mr r3, r2
LBB1_2: ;entry
        blr

instead of:

_test:
        addic r2,r3,-1
        subfe r0,r2,r3
        slwi r3,r0,19
        blr

This sort of thing occurs a lot due to globalopt.

===-------------------------------------------------------------------------===

We compile:

define i32 @bar(i32 %x) nounwind readnone ssp {
entry:
  %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
  %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
  ret i32 %neg
}

to:

_bar:
	cntlzw r2, r3
	slwi r2, r2, 26
	srawi r3, r2, 31
	blr

it would be better to produce:

_bar:
        addic r3,r3,-1
        subfe r3,r3,r3
        blr

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
        %B = call i32 @llvm.bswap.i32(i32 %A)
        ret i32 %B
}

to:

_test:
        rlwinm r2, r3, 24, 16, 23
        slwi r4, r3, 24
        rlwimi r2, r3, 8, 24, 31
        rlwimi r4, r3, 8, 8, 15
        rlwimi r4, r2, 0, 16, 31
        mr r3, r4
        blr

it would be more efficient to produce:

_test:  mr r0,r3
        rlwinm r3,r3,8,0xffffffff
        rlwimi r3,r0,24,0,7
        rlwimi r3,r0,24,16,23
        blr

===-------------------------------------------------------------------------===

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
        ld r2, 0(r3)
        cntlzd r2, r2
        or r2, r2, r2     <<-- silly.
        addi r3, r2, -64
        blr

The dead or is a 'truncate' from 64 to 32 bits.

===-------------------------------------------------------------------------===

We generate horrible ppc code for this:

#define N  2000000
double   a[N],c[N];
void simpleloop() {
   int j;
   for (j=0; j<N; j++)
     c[j] = a[j];
}

LBB1_1: ;bb
        lfdx f0, r3, r4
        addi r5, r5, 1                 ;; Extra IV for the exit value compare.
        stfdx f0, r2, r4
        addi r4, r4, 8

        xoris r6, r5, 30               ;; This is due to a large immediate.
        cmplwi cr0, r6, 33920
        bne cr0, LBB1_1

//===---------------------------------------------------------------------===//

This:
        #include <utility>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:

__Z11no_overflowjj:
        add r4,r3,r4
        subfc r3,r3,r4
        li r3,0
        adde r3,r3,r3
        blr

(or better) not:

__Z11no_overflowjj:
        add r2, r4, r3
        cmplw cr7, r2, r3
        mfcr r2
        rlwinm r2, r2, 29, 31, 31
        xori r3, r2, 1
        blr

//===---------------------------------------------------------------------===//

We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
example:

#include <math.h>
int test(double x, double y)  { return islessequal(x, y); }
int test2(double x, double y) { return islessgreater(x, y); }
int test3(double x, double y) { return !islessequal(x, y); }

Compiles into (all three are similar, but the bits differ):

_test:
	fcmpu cr7, f1, f2
	mfcr r2
	rlwinm r3, r2, 29, 31, 31
	rlwinm r2, r2, 31, 31, 31
	or r3, r2, r3
	blr

GCC compiles this into:

_test:
	fcmpu cr7,f1,f2
	cror 30,28,30
	mfcr r3
	rlwinm r3,r3,31,1
	blr

which is more efficient and can use mfocr.  See PR642 for some more context.

//===---------------------------------------------------------------------===//

void foo(float *data, float d) {
   long i;
   for (i = 0; i < 8000; i++)
      data[i] = d;
}
void foo2(float *data, float d) {
   long i;
   data--;
   for (i = 0; i < 8000; i++) {
      data[1] = d;
      data++;
   }
}

These compile to:

_foo:
	li r2, 0
LBB1_1:	; bb
	addi r4, r2, 4
	stfsx f1, r3, r2
	cmplwi cr0, r4, 32000
	mr r2, r4
	bne cr0, LBB1_1	; bb
	blr
_foo2:
	li r2, 0
LBB2_1:	; bb
	addi r4, r2, 4
	stfsx f1, r3, r2
	cmplwi cr0, r4, 32000
	mr r2, r4
	bne cr0, LBB2_1	; bb
	blr

The 'mr' could be eliminated by better folding of the add into the cmp.

//===---------------------------------------------------------------------===//

Codegen for the following (low-probability) case deteriorated considerably
when the correctness fixes for unordered comparisons went in (PR 642, 58871).
It should be possible to recover the code quality described in the comments.

; RUN: llvm-as < %s | llc -march=ppc32  | grep or | count 3
; This should produce one 'or' or 'cror' instruction per function.

; RUN: llvm-as < %s | llc -march=ppc32  | grep mfcr | count 3
; PR2964

define i32 @test(double %x, double %y) nounwind  {
entry:
	%tmp3 = fcmp ole double %x, %y		; <i1> [#uses=1]
	%tmp345 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
	ret i32 %tmp345
}

define i32 @test2(double %x, double %y) nounwind  {
entry:
	%tmp3 = fcmp one double %x, %y		; <i1> [#uses=1]
	%tmp345 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
	ret i32 %tmp345
}

define i32 @test3(double %x, double %y) nounwind  {
entry:
	%tmp3 = fcmp ugt double %x, %y		; <i1> [#uses=1]
	%tmp34 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
	ret i32 %tmp34
}

//===----------------------------------------------------------------------===//

; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg

; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
; should not be generated except with -enable-finite-only-fp-math or the like).
; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
; recognize a more elaborate tree than a simple SETxx.

define double @test_FNEG_sel(double %A, double %B, double %C) {
        %D = fsub double -0.000000e+00, %A               ; <double> [#uses=1]
        %Cond = fcmp ugt double %D, -0.000000e+00               ; <i1> [#uses=1]
        %E = select i1 %Cond, double %B, double %C              ; <double> [#uses=1]
        ret double %E
}

//===----------------------------------------------------------------------===//

The save/restore sequence for CR in prolog/epilog is terrible:
- Each CR subreg is saved individually, rather than doing one save as a unit.
- On Darwin, the save is done after the decrement of SP, which means the offset
  from SP of the save slot can be too big for a store instruction, which means
  we need an additional register (currently hacked in 96015+96020; the solution
  there is correct, but poor).
- On SVR4 the same thing can happen, and I don't think saving before the SP
  decrement is safe on that target, as there is no red zone.  This is currently
  broken AFAIK, although it's not a target I can exercise.

The following demonstrates the problem:

extern void bar(char *p);
void foo() {
  char x[100000];
  bar(x);
  __asm__("" ::: "cr2");
}


README_ALTIVEC.txt

//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//

Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
registers, to generate better spill code.

//===----------------------------------------------------------------------===//

The first should be a single lvx from the constant pool, the second should be
an xor/stvx:

void foo(void) {
  int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
  bar (x);
}

#include <string.h>
void foo(void) {
  int x[8] __attribute__((aligned(128)));
  memset (x, 0, sizeof (x));
  bar (x);
}

//===----------------------------------------------------------------------===//

Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763

When -ffast-math is on, we can use 0.0.

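At the source level, the intended lowering looks like this (illustrative
only): -0.0 is the right addend because x + -0.0 == x for every x, whereas
adding +0.0 turns a -0.0 product into +0.0.

#include <altivec.h>
vector float vmul(vector float a, vector float b) {
  /* a*b via FMADD, with a sign-preserving zero addend */
  return vec_madd(a, b, (vector float){-0.0f, -0.0f, -0.0f, -0.0f});
}
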
//===----------------------------------------------------------------------===//

Consider this:

  v4f32 Vector;
  v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };

Since we know that "Vector" is 16-byte aligned and we know the element offset
of ".X", we should change the load into a lve*x instruction, instead of doing
a load/store/lve*x sequence.

//===----------------------------------------------------------------------===//

For functions that use altivec AND have calls, we are VRSAVE'ing all
call-clobbered regs.

//===----------------------------------------------------------------------===//

Implement passing vectors by value into calls and receiving them as arguments.

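That is, trivial code like this (an illustration, not a testcase) should work
end to end:

#include <altivec.h>
vector float callee(vector float a, vector float b) { return vec_add(a, b); }
vector float caller(vector float x) { return callee(x, x); }
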
//===----------------------------------------------------------------------===//

GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
of C1/C2/C3, then a load and vperm of Variable.

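The pattern in question, with made-up values:

#include <altivec.h>
vector int build(int x) {
  return (vector int){ 10, 20, x, 30 };   /* three constants, one variable */
}
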
//===----------------------------------------------------------------------===//

We need a way to teach tblgen that some operands of an intrinsic are required to
be constants.  The verifier should enforce this constraint.

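For example, vec_splat's element-number operand must be a compile-time
constant, because the underlying vsplt* instructions encode it as an
immediate field:

#include <altivec.h>
vector float splat2(vector float v) {
  return vec_splat(v, 2);   /* the 2 must be a literal */
}
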
//===----------------------------------------------------------------------===//

We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
aligned stack slot, followed by a load/vperm.  We should probably just store it
to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
in memory this is a big win.

//===----------------------------------------------------------------------===//

extract_vector_elt of an arbitrary constant vector can be done with the
following instructions:

vTemp = vec_splat(v0,2);    // 2 is the element the src is in.
vec_ste(&destloc,0,vTemp);

We can do an arbitrary non-constant value by using lvsr/perm/ste.

//===----------------------------------------------------------------------===//

If we want to tie instruction selection into the scheduler, we can do some
constant formation with different instructions.  For example, we can generate
"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
"vsplti 0" or "vxor", each of which uses different execution units and thus
could help scheduling.

This is probably only reasonable for a post-pass scheduler.

//===----------------------------------------------------------------------===//

For this function:

void test(vector float *A, vector float *B) {
  vector float C = (vector float)vec_cmpeq(*A, *B);
  if (!vec_any_eq(*A, *B))
    *B = (vector float){0,0,0,0};
  *A = C;
}

we get the following basic block:

	...
        lvx v2, 0, r4
        lvx v3, 0, r3
        vcmpeqfp v4, v3, v2
        vcmpeqfp. v2, v3, v2
        bne cr6, LBB1_2 ; cond_next

The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
vcmpeqfp. result is used by a branch.  This can be improved.

//===----------------------------------------------------------------------===//

The code generated for this is truly awful:

vector float test(float a, float b) {
  return (vector float){ 0.0, a, 0.0, 0.0};
}

LCPI1_0:                                        ;  float
        .space  4
        .text
        .globl  _test
        .align  4
_test:
        mfspr r2, 256
        oris r3, r2, 4096
        mtspr 256, r3
        lis r3, ha16(LCPI1_0)
        addi r4, r1, -32
        stfs f1, -16(r1)
        addi r5, r1, -16
        lfs f0, lo16(LCPI1_0)(r3)
        stfs f0, -32(r1)
        lvx v2, 0, r4
        lvx v3, 0, r5
        vmrghw v3, v3, v2
        vspltw v2, v2, 0
        vmrghw v2, v2, v3
        mtspr 256, r2
        blr

//===----------------------------------------------------------------------===//

int foo(vector float *x, vector float *y) {
        if (vec_all_eq(*x,*y)) return 3245;
        else return 12;
}

A predicate compare being used in a select_cc should have the same peephole
applied to it as a predicate compare used by a br_cc.  There should be no
mfcr here:

_foo:
        mfspr r2, 256
        oris r5, r2, 12288
        mtspr 256, r5
        li r5, 12
        li r6, 3245
        lvx v2, 0, r4
        lvx v3, 0, r3
        vcmpeqfp. v2, v3, v2
        mfcr r3, 2
        rlwinm r3, r3, 25, 31, 31
        cmpwi cr0, r3, 0
        bne cr0, LBB1_2 ; entry
LBB1_1: ; entry
        mr r6, r5
LBB1_2: ; entry
        mr r3, r6
        mtspr 256, r2
        blr

//===----------------------------------------------------------------------===//

CodeGen/PowerPC/vec_constants.ll has an and operation that should be
codegen'd to andc.  The issue is that the 'all ones' build vector is
SelectNodeTo'd to a VSPLTISB instruction node before the and/xor is selected,
which prevents the vnot pattern from matching.

//===----------------------------------------------------------------------===//

An alternative to the store/store/load approach for illegal insert element
lowering would be:

1. store element to any ol' slot
2. lvx the slot
3. lvsl 0; splat index; vcmpeq to generate a select mask
4. lvsl slot + x; vperm to rotate result into correct slot
5. vsel result together.

//===----------------------------------------------------------------------===//

Should codegen branches on vec_any/vec_all to avoid mfcr.  Two examples:

#include <altivec.h>
int f(vector float a, vector float b)
{
  int aa = 0;
  if (vec_all_ge(a, b))
    aa |= 0x1;
  if (vec_any_ge(a, b))
    aa |= 0x2;
  return aa;
}

vector float f(vector float a, vector float b) {
  if (vec_any_eq(a, b))
    return a;
  else
    return b;
}