Home | History | Annotate | only in /external/llvm/lib/Target/PowerPC
Up to higher level directory
NameDateSize
CMakeLists.txt20-Sep-20131.1K
InstPrinter/20-Sep-2013
LLVMBuild.txt20-Sep-2013992
Makefile20-Sep-2013785
MCTargetDesc/20-Sep-2013
PPC.h20-Sep-20133K
PPC.td20-Sep-20139.7K
PPCAsmPrinter.cpp20-Sep-201342.5K
PPCBranchSelector.cpp20-Sep-20136.6K
PPCCallingConv.td20-Sep-20136.4K
PPCCodeEmitter.cpp20-Sep-201310K
PPCCTRLoops.cpp20-Sep-201327.7K
PPCFrameLowering.cpp20-Sep-201345.7K
PPCFrameLowering.h20-Sep-20138.9K
PPCHazardRecognizers.cpp20-Sep-20138.2K
PPCHazardRecognizers.h20-Sep-20133.3K
PPCInstr64Bit.td20-Sep-201344.2K
PPCInstrAltivec.td20-Sep-201335.6K
PPCInstrBuilder.h20-Sep-20131.4K
PPCInstrFormats.td20-Sep-201325.7K
PPCInstrInfo.cpp20-Sep-201327.1K
PPCInstrInfo.h20-Sep-20136.5K
PPCInstrInfo.td20-Sep-201372.5K
PPCISelDAGToDAG.cpp20-Sep-201359.2K
PPCISelLowering.cpp20-Sep-2013280K
PPCISelLowering.h20-Sep-201327.3K
PPCJITInfo.cpp20-Sep-201317.6K
PPCJITInfo.h20-Sep-20131.6K
PPCMachineFunctionInfo.cpp20-Sep-2013438
PPCMachineFunctionInfo.h20-Sep-20135.7K
PPCMCInstLower.cpp20-Sep-20136.9K
PPCPerfectShuffle.h20-Sep-2013397.4K
PPCRegisterInfo.cpp20-Sep-201319.4K
PPCRegisterInfo.h20-Sep-20132.8K
PPCRegisterInfo.td20-Sep-20137.5K
PPCRelocations.h20-Sep-20131.9K
PPCSchedule.td20-Sep-201314.7K
PPCSchedule440.td20-Sep-201335.5K
PPCScheduleA2.td20-Sep-201348.9K
PPCScheduleE500mc.td20-Sep-201314.7K
PPCScheduleE5500.td20-Sep-201317.7K
PPCScheduleG3.td20-Sep-20133.5K
PPCScheduleG4.td20-Sep-20134.1K
PPCScheduleG4Plus.td20-Sep-20134.6K
PPCScheduleG5.td20-Sep-20135K
PPCSelectionDAGInfo.cpp20-Sep-2013737
PPCSelectionDAGInfo.h20-Sep-2013830
PPCSubtarget.cpp20-Sep-20135K
PPCSubtarget.h20-Sep-20136K
PPCTargetMachine.cpp20-Sep-20134.6K
PPCTargetMachine.h20-Sep-20133.3K
PPCTargetTransformInfo.cpp20-Sep-20137.9K
README.txt20-Sep-201324K
README_ALTIVEC.txt20-Sep-20136.2K
TargetInfo/20-Sep-2013

README.txt

      1 //===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
      2 
      3 TODO:
      4 * gpr0 allocation
      5 * lmw/stmw pass a la arm load store optimizer for prolog/epilog
      6 
      7 ===-------------------------------------------------------------------------===
      8 
      9 On PPC64, this:
     10 
     11 long f2 (long x) { return 0xfffffff000000000UL; }
     12 long f3 (long x) { return 0x1ffffffffUL; }
     13 
     14 could compile into:
     15 
     16 _f2:
     17 	li r3,-1
     18 	rldicr r3,r3,0,27
     19 	blr
     20 _f3:
     21 	li r3,-1
     22 	rldicl r3,r3,0,31
     23 	blr
     24 
     25 we produce:
     26 
     27 _f2:
     28 	lis r2, 4095
     29 	ori r2, r2, 65535
     30 	sldi r3, r2, 36
     31 	blr 
     32 _f3:
     33 	li r2, 1
     34 	sldi r2, r2, 32
     35 	oris r2, r2, 65535
     36 	ori r3, r2, 65535
     37 	blr 
     38 
     39 ===-------------------------------------------------------------------------===
     40 
     41 This code:
     42 
     43 unsigned add32carry(unsigned sum, unsigned x) {
     44  unsigned z = sum + x;
     45  if (sum + x < x)
     46      z++;
     47  return z;
     48 }
     49 
     50 Should compile to something like:
     51 
     52 	addc r3,r3,r4
     53 	addze r3,r3
     54 
     55 instead we get:
     56 
     57 	add r3, r4, r3
     58 	cmplw cr7, r3, r4
     59 	mfcr r4 ; 1
     60 	rlwinm r4, r4, 29, 31, 31
     61 	add r3, r3, r4
     62 
     63 Ick.
     64 
     65 ===-------------------------------------------------------------------------===
     66 
     67 Support 'update' load/store instructions.  These are cracked on the G5, but are
     68 still a codesize win.
     69 
     70 With preinc enabled, this:
     71 
     72 long *%test4(long *%X, long *%dest) {
     73         %Y = getelementptr long* %X, int 4
     74         %A = load long* %Y
     75         store long %A, long* %dest
     76         ret long* %Y
     77 }
     78 
     79 compiles to:
     80 
     81 _test4:
     82         mr r2, r3
     83         lwzu r5, 32(r2)
     84         lwz r3, 36(r3)
     85         stw r5, 0(r4)
     86         stw r3, 4(r4)
     87         mr r3, r2
     88         blr 
     89 
     90 with -sched=list-burr, I get:
     91 
     92 _test4:
     93         lwz r2, 36(r3)
     94         lwzu r5, 32(r3)
     95         stw r2, 4(r4)
     96         stw r5, 0(r4)
     97         blr 
     98 
     99 ===-------------------------------------------------------------------------===
    100 
    101 We compile the hottest inner loop of viterbi to:
    102 
    103         li r6, 0
    104         b LBB1_84       ;bb432.i
    105 LBB1_83:        ;bb420.i
    106         lbzx r8, r5, r7
    107         addi r6, r7, 1
    108         stbx r8, r4, r7
    109 LBB1_84:        ;bb432.i
    110         mr r7, r6
    111         cmplwi cr0, r7, 143
    112         bne cr0, LBB1_83        ;bb420.i
    113 
    114 The CBE manages to produce:
    115 
    116 	li r0, 143
    117 	mtctr r0
    118 loop:
    119 	lbzx r2, r2, r11
    120 	stbx r0, r2, r9
    121 	addi r2, r2, 1
    122 	bdz later
    123 	b loop
    124 
    125 This could be much better (bdnz instead of bdz) but it still beats us.  If we
    126 produced this with bdnz, the loop would be a single dispatch group.
    127 
    128 ===-------------------------------------------------------------------------===
    129 
    130 Compile:
    131 
    132 void foo(int *P) {
    133  if (P)  *P = 0;
    134 }
    135 
    136 into:
    137 
    138 _foo:
    139         cmpwi cr0,r3,0
    140         beqlr cr0
    141         li r0,0
    142         stw r0,0(r3)
    143         blr
    144 
    145 This is effectively a simple form of predication.
    146 
    147 ===-------------------------------------------------------------------------===
    148 
    149 Lump the constant pool for each function into ONE pic object, and reference
    150 pieces of it as offsets from the start.  For functions like this (contrived
    151 to have lots of constants obviously):
    152 
    153 double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
    154 
    155 We generate:
    156 
    157 _X:
    158         lis r2, ha16(.CPI_X_0)
    159         lfd f0, lo16(.CPI_X_0)(r2)
    160         lis r2, ha16(.CPI_X_1)
    161         lfd f2, lo16(.CPI_X_1)(r2)
    162         fmadd f0, f1, f0, f2
    163         lis r2, ha16(.CPI_X_2)
    164         lfd f1, lo16(.CPI_X_2)(r2)
    165         lis r2, ha16(.CPI_X_3)
    166         lfd f2, lo16(.CPI_X_3)(r2)
    167         fmadd f1, f0, f1, f2
    168         blr
    169 
    170 It would be better to materialize .CPI_X into a register, then use immediates
    171 off of the register to avoid the lis's.  This is even more important in PIC 
    172 mode.
    173 
    174 Note that this (and the static variable version) is discussed here for GCC:
    175 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
    176 
    177 Here's another example (the sgn function):
    178 double testf(double a) {
    179        return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
    180 }
    181 
    182 it produces a BB like this:
    183 LBB1_1: ; cond_true
    184         lis r2, ha16(LCPI1_0)
    185         lfs f0, lo16(LCPI1_0)(r2)
    186         lis r2, ha16(LCPI1_1)
    187         lis r3, ha16(LCPI1_2)
    188         lfs f2, lo16(LCPI1_2)(r3)
    189         lfs f3, lo16(LCPI1_1)(r2)
    190         fsub f0, f0, f1
    191         fsel f1, f0, f2, f3
    192         blr 
    193 
    194 ===-------------------------------------------------------------------------===
    195 
    196 PIC Code Gen IPO optimization:
    197 
    198 Squish small scalar globals together into a single global struct, allowing the 
    199 address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
    200 of the GOT on targets with one).
    201 
    202 Note that this is discussed here for GCC:
    203 http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
    204 
    205 ===-------------------------------------------------------------------------===
    206 
    207 Implement Newton-Raphson method for improving estimate instructions to the
    208 correct accuracy, and implementing divide as multiply by reciprocal when it has
    209 more than one use.  Itanium would want this too.
    210 
    211 ===-------------------------------------------------------------------------===
    212 
    213 Compile offsets from allocas:
    214 
    215 int *%test() {
    216         %X = alloca { int, int }
    217         %Y = getelementptr {int,int}* %X, int 0, uint 1
    218         ret int* %Y
    219 }
    220 
    221 into a single add, not two:
    222 
    223 _test:
    224         addi r2, r1, -8
    225         addi r3, r2, 4
    226         blr
    227 
    228 --> important for C++.
    229 
    230 ===-------------------------------------------------------------------------===
    231 
    232 No loads or stores of the constants should be needed:
    233 
    234 struct foo { double X, Y; };
    235 void xxx(struct foo F);
    236 void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
    237 
    238 ===-------------------------------------------------------------------------===
    239 
    240 Darwin Stub removal:
    241 
    242 We still generate calls to foo$stub, and stubs, on Darwin.  This is not
    243 necessary when building with the Leopard (10.5) or later linker, as stubs are
    244 generated by ld when necessary.  Parameterizing this based on the deployment
    245 target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
    246 its logic.
    247 
    248 ===-------------------------------------------------------------------------===
    249 
    250 Darwin Stub LICM optimization:
    251 
    252 Loops like this:
    253   
    254   for (...)  bar();
    255 
    256 Have to go through an indirect stub if bar is external or linkonce.  It would 
    257 be better to compile it as:
    258 
    259      fp = &bar;
    260      for (...)  fp();
    261 
    262 which only computes the address of bar once (instead of each time through the 
    263 stub).  This is Darwin specific and would have to be done in the code generator.
    264 Probably not a win on x86.
    265 
    266 ===-------------------------------------------------------------------------===
    267 
    268 Simple IPO for argument passing, change:
    269   void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
    270 
    271 the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
    272 of arguments get assigned to r3 through r10. That is, if you have a function
    273 foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
    274 argument bytes for r4 and r5. The trick then would be to shuffle the argument
    275 order for functions we can internalize so that the maximum number of 
    276 integers/pointers get passed in regs before you see any of the fp arguments.
    277 
    278 Instead of implementing this, it would actually probably be easier to just 
    279 implement a PPC fastcc, where we could do whatever we wanted to the CC, 
    280 including having this work sanely.
    281 
    282 ===-------------------------------------------------------------------------===
    283 
    284 Fix Darwin FP-In-Integer Registers ABI
    285 
    286 Darwin passes doubles in structures in integer registers, which is very very 
    287 bad.  Add something like a BITCAST to LLVM, then do an i-p transformation that
    288 percolates these things out of functions.
    289 
    290 Check out how horrible this is:
    291 http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
    292 
    293 This is an extension of "interprocedural CC unmunging" that can't be done with
    294 just fastcc.
    295 
    296 ===-------------------------------------------------------------------------===
    297 
    298 Compile this:
    299 
    300 int foo(int a) {
    301   int b = (a < 8);
    302   if (b) {
    303     return b * 3;     // ignore the fact that this is always 3.
    304   } else {
    305     return 2;
    306   }
    307 }
    308 
    309 into something not this:
    310 
    311 _foo:
    312 1)      cmpwi cr7, r3, 8
    313         mfcr r2, 1
    314         rlwinm r2, r2, 29, 31, 31
    315 1)      cmpwi cr0, r3, 7
    316         bgt cr0, LBB1_2 ; UnifiedReturnBlock
    317 LBB1_1: ; then
    318         rlwinm r2, r2, 0, 31, 31
    319         mulli r3, r2, 3
    320         blr
    321 LBB1_2: ; UnifiedReturnBlock
    322         li r3, 2
    323         blr
    324 
    325 In particular, the two compares (marked 1) could be shared by reversing one.
    326 This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
    327 same operands (but backwards) exists.  In this case, this wouldn't save us 
    328 anything though, because the compares still wouldn't be shared.
    329 
    330 ===-------------------------------------------------------------------------===
    331 
    332 We should custom expand setcc instead of pretending that we have it.  That
    333 would allow us to expose the access of the crbit after the mfcr, allowing
    334 that access to be trivially folded into other ops.  A simple example:
    335 
    336 int foo(int a, int b) { return (a < b) << 4; }
    337 
    338 compiles into:
    339 
    340 _foo:
    341         cmpw cr7, r3, r4
    342         mfcr r2, 1
    343         rlwinm r2, r2, 29, 31, 31
    344         slwi r3, r2, 4
    345         blr
    346 
    347 ===-------------------------------------------------------------------------===
    348 
    349 Fold add and sub with constant into non-extern, non-weak addresses so this:
    350 
    351 static int a;
    352 void bar(int b) { a = b; }
    353 void foo(unsigned char *c) {
    354   *c = a;
    355 }
    356 
    357 So that 
    358 
    359 _foo:
    360         lis r2, ha16(_a)
    361         la r2, lo16(_a)(r2)
    362         lbz r2, 3(r2)
    363         stb r2, 0(r3)
    364         blr
    365 
    366 Becomes
    367 
    368 _foo:
    369         lis r2, ha16(_a+3)
    370         lbz r2, lo16(_a+3)(r2)
    371         stb r2, 0(r3)
    372         blr
    373 
    374 ===-------------------------------------------------------------------------===
    375 
    376 We generate really bad code for this:
    377 
    378 int f(signed char *a, _Bool b, _Bool c) {
    379    signed char t = 0;
    380   if (b)  t = *a;
    381   if (c)  *a = t;
    382 }
    383 
    384 ===-------------------------------------------------------------------------===
    385 
    386 This:
    387 int test(unsigned *P) { return *P >> 24; }
    388 
    389 Should compile to:
    390 
    391 _test:
    392         lbz r3,0(r3)
    393         blr
    394 
    395 not:
    396 
    397 _test:
    398         lwz r2, 0(r3)
    399         srwi r3, r2, 24
    400         blr
    401 
    402 ===-------------------------------------------------------------------------===
    403 
    404 On the G5, logical CR operations are more expensive in their three
    405 address form: ops that read/write the same register are half as expensive as
    406 those that read from two registers that are different from their destination.
    407 
    408 We should model this with two separate instructions.  The isel should generate
    409 the "two address" form of the instructions.  When the register allocator 
    410 detects that it needs to insert a copy due to the two-addressness of the CR
    411 logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
    412 we can convert to the "three address" instruction, to save code space.
    413 
    414 This only matters when we start generating cr logical ops.
    415 
    416 ===-------------------------------------------------------------------------===
    417 
    418 We should compile these two functions to the same thing:
    419 
    420 #include <stdlib.h>
    421 void f(int a, int b, int *P) {
    422   *P = (a-b)>=0?(a-b):(b-a);
    423 }
    424 void g(int a, int b, int *P) {
    425   *P = abs(a-b);
    426 }
    427 
    428 Further, they should compile to something better than:
    429 
    430 _g:
    431         subf r2, r4, r3
    432         subfic r3, r2, 0
    433         cmpwi cr0, r2, -1
    434         bgt cr0, LBB2_2 ; entry
    435 LBB2_1: ; entry
    436         mr r2, r3
    437 LBB2_2: ; entry
    438         stw r2, 0(r5)
    439         blr
    440 
    441 GCC produces:
    442 
    443 _g:
    444         subf r4,r4,r3
    445         srawi r2,r4,31
    446         xor r0,r2,r4
    447         subf r0,r2,r0
    448         stw r0,0(r5)
    449         blr
    450 
    451 ... which is much nicer.
    452 
    453 This theoretically may help improve twolf slightly (used in dimbox.c:142?).
    454 
    455 ===-------------------------------------------------------------------------===
    456 
    457 PR5945: This: 
    458 define i32 @clamp0g(i32 %a) {
    459 entry:
    460         %cmp = icmp slt i32 %a, 0
    461         %sel = select i1 %cmp, i32 0, i32 %a
    462         ret i32 %sel
    463 }
    464 
    465 Is compiled to this with the PowerPC (32-bit) backend:
    466 
    467 _clamp0g:
    468         cmpwi cr0, r3, 0
    469         li r2, 0
    470         blt cr0, LBB1_2
    471 ; BB#1:                                                     ; %entry
    472         mr r2, r3
    473 LBB1_2:                                                     ; %entry
    474         mr r3, r2
    475         blr
    476 
    477 This could be reduced to the much simpler:
    478 
    479 _clamp0g:
    480         srawi r2, r3, 31
    481         andc r3, r3, r2
    482         blr
    483 
    484 ===-------------------------------------------------------------------------===
    485 
    486 int foo(int N, int ***W, int **TK, int X) {
    487   int t, i;
    488   
    489   for (t = 0; t < N; ++t)
    490     for (i = 0; i < 4; ++i)
    491       W[t / X][i][t % X] = TK[i][t];
    492       
    493   return 5;
    494 }
    495 
    496 We generate relatively atrocious code for this loop compared to gcc.
    497 
    498 We could also strength reduce the rem and the div:
    499 http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
    500 
    501 ===-------------------------------------------------------------------------===
    502 
    503 float foo(float X) { return (int)(X); }
    504 
    505 Currently produces:
    506 
    507 _foo:
    508         fctiwz f0, f1
    509         stfd f0, -8(r1)
    510         lwz r2, -4(r1)
    511         extsw r2, r2
    512         std r2, -16(r1)
    513         lfd f0, -16(r1)
    514         fcfid f0, f0
    515         frsp f1, f0
    516         blr
    517 
    518 We could use a target dag combine to turn the lwz/extsw into an lwa when the 
    519 lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
    520 win only.
    521 
    522 ===-------------------------------------------------------------------------===
    523 
    524 We generate ugly code for this:
    525 
    526 void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
    527   unsigned code = 0;
    528   if(dx < -dw) code |= 1;
    529   if(dx > dw)  code |= 2;
    530   if(dy < -dw) code |= 4;
    531   if(dy > dw)  code |= 8;
    532   if(dz < -dw) code |= 16;
    533   if(dz > dw)  code |= 32;
    534   *ret = code;
    535 }
    536 
    537 ===-------------------------------------------------------------------------===
    538 
    539 Complete the signed i32 to FP conversion code using 64-bit registers
    540 transformation, good for PI.  See PPCISelLowering.cpp, this comment:
    541 
    542      // FIXME: disable this lowered code.  This generates 64-bit register values,
    543      // and we don't model the fact that the top part is clobbered by calls.  We
    544      // need to flag these together so that the value isn't live across a call.
    545      //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    546 
    547 Also, if the registers are spilled to the stack, we have to ensure that all
    548 64-bits of them are save/restored, otherwise we will miscompile the code.  It
    549 sounds like we need to get the 64-bit register classes going.
    550 
    551 ===-------------------------------------------------------------------------===
    552 
    553 %struct.B = type { i8, [3 x i8] }
    554 
    555 define void @bar(%struct.B* %b) {
    556 entry:
    557         %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
    558         %tmp = load i32* %tmp          ; <uint> [#uses=1]
    559         %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
    560         %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
    561         %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
    562         %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
    563         %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
    564         %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
    565         %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
    566         %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
    567         %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
    568         %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
    569         store i32 %tmp13, i32* %tmp8
    570         ret void
    571 }
    572 
    573 We emit:
    574 
    575 _foo:
    576         lwz r2, 0(r3)
    577         slwi r4, r2, 1
    578         or r4, r4, r2
    579         rlwimi r2, r4, 0, 0, 0
    580         stw r2, 0(r3)
    581         blr
    582 
    583 We could collapse a bunch of those ORs and ANDs and generate the following
    584 equivalent code:
    585 
    586 _foo:
    587         lwz r2, 0(r3)
    588         rlwinm r4, r2, 1, 0, 0
    589         or r2, r2, r4
    590         stw r2, 0(r3)
    591         blr
    592 
    593 ===-------------------------------------------------------------------------===
    594 
    595 We compile:
    596 
    597 unsigned test6(unsigned x) { 
    598   return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
    599 }
    600 
    601 into:
    602 
    603 _test6:
    604         lis r2, 255
    605         rlwinm r3, r3, 16, 0, 31
    606         ori r2, r2, 255
    607         and r3, r3, r2
    608         blr
    609 
    610 GCC gets it down to:
    611 
    612 _test6:
    613         rlwinm r0,r3,16,8,15
    614         rlwinm r3,r3,16,24,31
    615         or r3,r3,r0
    616         blr
    617 
    618 
    619 ===-------------------------------------------------------------------------===
    620 
    621 Consider a function like this:
    622 
    623 float foo(float X) { return X + 1234.4123f; }
    624 
    625 The FP constant ends up in the constant pool, so we need to get the LR register.
    626  This ends up producing code like this:
    627 
    628 _foo:
    629 .LBB_foo_0:     ; entry
    630         mflr r11
    631 ***     stw r11, 8(r1)
    632         bl "L00000$pb"
    633 "L00000$pb":
    634         mflr r2
    635         addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
    636         lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
    637         fadds f1, f1, f0
    638 ***     lwz r11, 8(r1)
    639         mtlr r11
    640         blr
    641 
    642 This is functional, but there is no reason to spill the LR register all the way
    643 to the stack (the two marked instrs): spilling it to a GPR is quite enough.
    644 
    645 Implementing this will require some codegen improvements.  Nate writes:
    646 
    647 "So basically what we need to support the "no stack frame save and restore" is a
    648 generalization of the LR optimization to "callee-save regs".
    649 
    650 Currently, we have LR marked as a callee-save reg.  The register allocator sees
    651 that it's callee save, and spills it directly to the stack.
    652 
    653 Ideally, something like this would happen:
    654 
    655 LR would be in a separate register class from the GPRs. The class of LR would be
    656 marked "unspillable".  When the register allocator came across an unspillable
    657 reg, it would ask "what is the best class to copy this into that I *can* spill"
    658 If it gets a class back, which it will in this case (the gprs), it grabs a free
    659 register of that class.  If it is then later necessary to spill that reg, so be
    660 it."
    661 
    662 ===-------------------------------------------------------------------------===
    663 
    664 We compile this:
    665 int test(_Bool X) {
    666   return X ? 524288 : 0;
    667 }
    668 
    669 to: 
    670 _test:
    671         cmplwi cr0, r3, 0
    672         lis r2, 8
    673         li r3, 0
    674         beq cr0, LBB1_2 ;entry
    675 LBB1_1: ;entry
    676         mr r3, r2
    677 LBB1_2: ;entry
    678         blr 
    679 
    680 instead of:
    681 _test:
    682         addic r2,r3,-1
    683         subfe r0,r2,r3
    684         slwi r3,r0,19
    685         blr
    686 
    687 This sort of thing occurs a lot due to globalopt.
    688 
    689 ===-------------------------------------------------------------------------===
    690 
    691 We compile:
    692 
    693 define i32 @bar(i32 %x) nounwind readnone ssp {
    694 entry:
    695   %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
    696   %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
    697   ret i32 %neg
    698 }
    699 
    700 to:
    701 
    702 _bar:
    703 	cntlzw r2, r3
    704 	slwi r2, r2, 26
    705 	srawi r3, r2, 31
    706 	blr 
    707 
    708 it would be better to produce:
    709 
    710 _bar: 
    711         addic r3,r3,-1
    712         subfe r3,r3,r3
    713         blr
    714 
    715 ===-------------------------------------------------------------------------===
    716 
    717 We currently compile 32-bit bswap:
    718 
    719 declare i32 @llvm.bswap.i32(i32 %A)
    720 define i32 @test(i32 %A) {
    721         %B = call i32 @llvm.bswap.i32(i32 %A)
    722         ret i32 %B
    723 }
    724 
    725 to:
    726 
    727 _test:
    728         rlwinm r2, r3, 24, 16, 23
    729         slwi r4, r3, 24
    730         rlwimi r2, r3, 8, 24, 31
    731         rlwimi r4, r3, 8, 8, 15
    732         rlwimi r4, r2, 0, 16, 31
    733         mr r3, r4
    734         blr 
    735 
    736 it would be more efficient to produce:
    737 
    738 _foo:   mr r0,r3
    739         rlwinm r3,r3,8,0xffffffff
    740         rlwimi r3,r0,24,0,7
    741         rlwimi r3,r0,24,16,23
    742         blr
    743 
    744 ===-------------------------------------------------------------------------===
    745 
    746 test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
    747 
    748 __ZNK4llvm5APInt17countLeadingZerosEv:
    749         ld r2, 0(r3)
    750         cntlzd r2, r2
    751         or r2, r2, r2     <<-- silly.
    752         addi r3, r2, -64
    753         blr 
    754 
    755 The dead or is a 'truncate' from 64- to 32-bits.
    756 
    757 ===-------------------------------------------------------------------------===
    758 
    759 We generate horrible ppc code for this:
    760 
    761 #define N  2000000
    762 double   a[N],c[N];
    763 void simpleloop() {
    764    int j;
    765    for (j=0; j<N; j++)
    766      c[j] = a[j];
    767 }
    768 
    769 LBB1_1: ;bb
    770         lfdx f0, r3, r4
    771         addi r5, r5, 1                 ;; Extra IV for the exit value compare.
    772         stfdx f0, r2, r4
    773         addi r4, r4, 8
    774 
    775         xoris r6, r5, 30               ;; This is due to a large immediate.
    776         cmplwi cr0, r6, 33920
    777         bne cr0, LBB1_1
    778 
    779 //===---------------------------------------------------------------------===//
    780 
    781 This:
    782         #include <algorithm>
    783         inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
    784         { return std::make_pair(a + b, a + b < a); }
    785         bool no_overflow(unsigned a, unsigned b)
    786         { return !full_add(a, b).second; }
    787 
    788 Should compile to:
    789 
    790 __Z11no_overflowjj:
    791         add r4,r3,r4
    792         subfc r3,r3,r4
    793         li r3,0
    794         adde r3,r3,r3
    795         blr
    796 
    797 (or better) not:
    798 
    799 __Z11no_overflowjj:
    800         add r2, r4, r3
    801         cmplw cr7, r2, r3
    802         mfcr r2
    803         rlwinm r2, r2, 29, 31, 31
    804         xori r3, r2, 1
    805         blr 
    806 
    807 //===---------------------------------------------------------------------===//
    808 
    809 We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
    810 example:
    811 #include <math.h>
    812 int test(double x, double y) { return islessequal(x, y);}
    813 int test2(double x, double y) {  return islessgreater(x, y);}
    814 int test3(double x, double y) {  return !islessequal(x, y);}
    815 
    816 Compiles into (all three are similar, but the bits differ):
    817 
    818 _test:
    819 	fcmpu cr7, f1, f2
    820 	mfcr r2
    821 	rlwinm r3, r2, 29, 31, 31
    822 	rlwinm r2, r2, 31, 31, 31
    823 	or r3, r2, r3
    824 	blr 
    825 
    826 GCC compiles this into:
    827 
    828  _test:
    829 	fcmpu cr7,f1,f2
    830 	cror 30,28,30
    831 	mfcr r3
    832 	rlwinm r3,r3,31,1
    833 	blr
    834         
    835 which is more efficient and can use mfocr.  See PR642 for some more context.
    836 
    837 //===---------------------------------------------------------------------===//
    838 
    839 void foo(float *data, float d) {
    840    long i;
    841    for (i = 0; i < 8000; i++)
    842       data[i] = d;
    843 }
    844 void foo2(float *data, float d) {
    845    long i;
    846    data--;
    847    for (i = 0; i < 8000; i++) {
    848       data[1] = d;
    849       data++;
    850    }
    851 }
    852 
    853 These compile to:
    854 
    855 _foo:
    856 	li r2, 0
    857 LBB1_1:	; bb
    858 	addi r4, r2, 4
    859 	stfsx f1, r3, r2
    860 	cmplwi cr0, r4, 32000
    861 	mr r2, r4
    862 	bne cr0, LBB1_1	; bb
    863 	blr 
    864 _foo2:
    865 	li r2, 0
    866 LBB2_1:	; bb
    867 	addi r4, r2, 4
    868 	stfsx f1, r3, r2
    869 	cmplwi cr0, r4, 32000
    870 	mr r2, r4
    871 	bne cr0, LBB2_1	; bb
    872 	blr 
    873 
    874 The 'mr' could be eliminated by folding the add into the cmp better.
    875 
    876 //===---------------------------------------------------------------------===//
    877 Codegen for the following (low-probability) case deteriorated considerably 
    878 when the correctness fixes for unordered comparisons went in (PR 642, 58871).
    879 It should be possible to recover the code quality described in the comments.
    880 
    881 ; RUN: llvm-as < %s | llc -march=ppc32  | grep or | count 3
    882 ; This should produce one 'or' or 'cror' instruction per function.
    883 
    884 ; RUN: llvm-as < %s | llc -march=ppc32  | grep mfcr | count 3
    885 ; PR2964
    886 
    887 define i32 @test(double %x, double %y) nounwind  {
    888 entry:
    889 	%tmp3 = fcmp ole double %x, %y		; <i1> [#uses=1]
    890 	%tmp345 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
    891 	ret i32 %tmp345
    892 }
    893 
    894 define i32 @test2(double %x, double %y) nounwind  {
    895 entry:
    896 	%tmp3 = fcmp one double %x, %y		; <i1> [#uses=1]
    897 	%tmp345 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
    898 	ret i32 %tmp345
    899 }
    900 
    901 define i32 @test3(double %x, double %y) nounwind  {
    902 entry:
    903 	%tmp3 = fcmp ugt double %x, %y		; <i1> [#uses=1]
    904 	%tmp34 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
    905 	ret i32 %tmp34
    906 }
    907 //===----------------------------------------------------------------------===//
    908 ; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
    909 
    910 ; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and 
    911 ; should not be generated except with -enable-finite-only-fp-math or the like).
    912 ; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
    913 ; recognize a more elaborate tree than a simple SETxx.
    914 
    915 define double @test_FNEG_sel(double %A, double %B, double %C) {
    916         %D = fsub double -0.000000e+00, %A               ; <double> [#uses=1]
    917         %Cond = fcmp ugt double %D, -0.000000e+00               ; <i1> [#uses=1]
    918         %E = select i1 %Cond, double %B, double %C              ; <double> [#uses=1]
    919         ret double %E
    920 }
    921 
    922 //===----------------------------------------------------------------------===//
    923 The save/restore sequence for CR in prolog/epilog is terrible:
    924 - Each CR subreg is saved individually, rather than doing one save as a unit.
    925 - On Darwin, the save is done after the decrement of SP, which means the offset
    926 from SP of the save slot can be too big for a store instruction, which means we
    927 need an additional register (currently hacked in 96015+96020; the solution there
    928 is correct, but poor).
    929 - On SVR4 the same thing can happen, and I don't think saving before the SP
    930 decrement is safe on that target, as there is no red zone.  This is currently
    931 broken AFAIK, although it's not a target I can exercise.
    932 The following demonstrates the problem:
    933 extern void bar(char *p);
    934 void foo() {
    935   char x[100000];
    936   bar(x);
    937   __asm__("" ::: "cr2");
    938 }
    939 

README_ALTIVEC.txt

      1 //===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//
      2 
      3 Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
      4 registers, to generate better spill code.
      5 
      6 //===----------------------------------------------------------------------===//
      7 
      8 The first should be a single lvx from the constant pool, the second should be 
      9 a xor/stvx:
     10 
     11 void foo(void) {
     12   int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
     13   bar (x);
     14 }
     15 
     16 #include <string.h>
     17 void foo(void) {
     18   int x[8] __attribute__((aligned(128)));
     19   memset (x, 0, sizeof (x));
     20   bar (x);
     21 }
     22 
     23 //===----------------------------------------------------------------------===//
     24 
     25 Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
     26 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763
     27 
     28 When -ffast-math is on, we can use 0.0.
     29 
     30 //===----------------------------------------------------------------------===//
     31 
     32   Consider this:
     33   v4f32 Vector;
     34   v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };
     35 
     36 Since we know that "Vector" is 16-byte aligned and we know the element offset 
     37 of ".X", we should change the load into a lve*x instruction, instead of doing
     38 a load/store/lve*x sequence.
     39 
     40 //===----------------------------------------------------------------------===//
     41 
     42 For functions that use altivec AND have calls, we are VRSAVE'ing all call
     43 clobbered regs.
     44 
     45 //===----------------------------------------------------------------------===//
     46 
     47 Implement passing vectors by value into calls and receiving them as arguments.
     48 
     49 //===----------------------------------------------------------------------===//
     50 
     51 GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
     52 of C1/C2/C3, then a load and vperm of Variable.
     53 
     54 //===----------------------------------------------------------------------===//
     55 
     56 We need a way to teach tblgen that some operands of an intrinsic are required to
     57 be constants.  The verifier should enforce this constraint.
     58 
     59 //===----------------------------------------------------------------------===//
     60 
     61 We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
     62 aligned stack slot, followed by a load/vperm.  We should probably just store it
     63 to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
     64 in memory this is a big win.
     65 
     66 //===----------------------------------------------------------------------===//
     67 
     68 extract_vector_elt of an arbitrary constant vector can be done with the 
     69 following instructions:
     70 
     71 vTemp = vec_splat(v0,2);    // 2 is the element the src is in.
     72 vec_ste(&destloc,0,vTemp);
     73 
     74 We can do an arbitrary non-constant value by using lvsr/perm/ste.
     75 
     76 //===----------------------------------------------------------------------===//
     77 
     78 If we want to tie instruction selection into the scheduler, we can do some
     79 constant formation with different instructions.  For example, we can generate
     80 "vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
     81 "vsplti 0" or "vxor", each of which uses a different execution unit and thus
     82 could help scheduling.
     83 
     84 This is probably only reasonable for a post-pass scheduler.
     85 
     86 //===----------------------------------------------------------------------===//
     87 
     88 For this function:
     89 
     90 void test(vector float *A, vector float *B) {
     91   vector float C = (vector float)vec_cmpeq(*A, *B);
     92   if (!vec_any_eq(*A, *B))
     93     *B = (vector float){0,0,0,0};
     94   *A = C;
     95 }
     96 
     97 we get the following basic block:
     98 
     99 	...
    100         lvx v2, 0, r4
    101         lvx v3, 0, r3
    102         vcmpeqfp v4, v3, v2
    103         vcmpeqfp. v2, v3, v2
    104         bne cr6, LBB1_2 ; cond_next
    105 
    106 The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
    107 vcmpeqfp. result is used by a branch.  This can be improved.
    108 
    109 //===----------------------------------------------------------------------===//
    110 
    111 The code generated for this is truly awful:
    112 
    113 vector float test(float a, float b) {
    114  return (vector float){ 0.0, a, 0.0, 0.0}; 
    115 }
    116 
    117 LCPI1_0:                                        ;  float
    118         .space  4
    119         .text
    120         .globl  _test
    121         .align  4
    122 _test:
    123         mfspr r2, 256
    124         oris r3, r2, 4096
    125         mtspr 256, r3
    126         lis r3, ha16(LCPI1_0)
    127         addi r4, r1, -32
    128         stfs f1, -16(r1)
    129         addi r5, r1, -16
    130         lfs f0, lo16(LCPI1_0)(r3)
    131         stfs f0, -32(r1)
    132         lvx v2, 0, r4
    133         lvx v3, 0, r5
    134         vmrghw v3, v3, v2
    135         vspltw v2, v2, 0
    136         vmrghw v2, v2, v3
    137         mtspr 256, r2
    138         blr
    139 
    140 //===----------------------------------------------------------------------===//
    141 
    142 int foo(vector float *x, vector float *y) {
    143         if (vec_all_eq(*x,*y)) return 3245; 
    144         else return 12;
    145 }
    146 
    147 A predicate compare being used in a select_cc should have the same peephole
    148 applied to it as a predicate compare used by a br_cc.  There should be no
    149 mfcr here:
    150 
    151 _foo:
    152         mfspr r2, 256
    153         oris r5, r2, 12288
    154         mtspr 256, r5
    155         li r5, 12
    156         li r6, 3245
    157         lvx v2, 0, r4
    158         lvx v3, 0, r3
    159         vcmpeqfp. v2, v3, v2
    160         mfcr r3, 2
    161         rlwinm r3, r3, 25, 31, 31
    162         cmpwi cr0, r3, 0
    163         bne cr0, LBB1_2 ; entry
    164 LBB1_1: ; entry
    165         mr r6, r5
    166 LBB1_2: ; entry
    167         mr r3, r6
    168         mtspr 256, r2
    169         blr
    170 
    171 //===----------------------------------------------------------------------===//
    172 
    173 CodeGen/PowerPC/vec_constants.ll has an and operation that should be
    174 codegen'd to andc.  The issue is that the 'all ones' build vector is
    175 SelectNodeTo'd into a VSPLTISB instruction node before the and/xor is selected,
    176 which prevents the vnot pattern from matching.
    177 
    178 
    179 //===----------------------------------------------------------------------===//
    180 
    181 An alternative to the store/store/load approach for illegal insert element 
    182 lowering would be:
    183 
    184 1. store element to any ol' slot
    185 2. lvx the slot
    186 3. lvsl 0; splat index; vcmpeq to generate a select mask
    187 4. lvsl slot + x; vperm to rotate result into correct slot
    188 5. vsel result together.
    189 
    190 //===----------------------------------------------------------------------===//
    191 
    192 Should codegen branches on vec_any/vec_all to avoid mfcr.  Two examples:
    193 
    194 #include <altivec.h>
    195  int f(vector float a, vector float b)
    196  {
    197   int aa = 0;
    198   if (vec_all_ge(a, b))
    199     aa |= 0x1;
    200   if (vec_any_ge(a,b))
    201     aa |= 0x2;
    202   return aa;
    203 }
    204 
    205 vector float f(vector float a, vector float b) { 
    206   if (vec_any_eq(a, b)) 
    207     return a; 
    208   else 
    209     return b; 
    210 }
    211 
    212