//===---------------------------------------------------------------------===//
// Random ideas for the ARM backend (Thumb specific).
//===---------------------------------------------------------------------===//

* Add support for compiling functions in both ARM and Thumb mode, then taking
  the smallest.

* Add support for compiling individual basic blocks in Thumb mode when inside a
  larger ARM function.  This can be used for presumed cold code, like paths
  to abort (failure path of asserts), EH handling code, etc.

* Thumb doesn't have normal pre/post increment addressing modes, but you can
  load/store 32-bit integers with pre/post increment by using load/store
  multiple instructions with a single register.
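
  For example (a sketch), a single-register load/store multiple with writeback
  behaves like a post-increment word access:

	ldmia	r0!, {r1}	@ r1 = [r0], then r0 += 4
	stmia	r2!, {r1}	@ [r2] = r1, then r2 += 4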

* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
  and cmp instructions can use high registers. Also, we can use them as
  temporaries to spill values into.
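
  A sketch of both uses (register choices are arbitrary):

	mov	r8, r4		@ spill r4 into a high register instead of memory
	add	r0, r8		@ add variant that takes a high operand
	cmp	r0, r8		@ cmp variant that takes a high operand
	mov	r4, r8		@ restore r4 from the high register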

* In Thumb mode, the preferred alignments for short, byte, and bool are
  currently set to 4 to accommodate an ISA restriction (i.e. for add sp, #imm,
  the immediate must be a multiple of 4).
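
  For instance (a sketch of the restriction):

	add	sp, #8		@ encodable: the sp-relative add immediate is imm7 * 4
	add	sp, #6		@ not encodable; would need extra instructions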

//===---------------------------------------------------------------------===//

Potential jumptable improvements:

* If we know the function size is less than (1 << 16) * 2 bytes, we can use
  16-bit jumptable entries (e.g. (L1 - L2) >> 1), or even smaller entries if the
  function is smaller still. This also applies to ARM. See the sketch at the end
  of this section.

* Thumb jumptable codegen can be improved given some help from the assembler.
  This is what we generate right now:

	.set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
LPCRELL0:
	mov r1, #PCRELV0
	add r1, pc
	ldr r0, [r0, r1]
	mov pc, r0
	.align	2
LJTI1_0_0:
	.long	 LBB1_3
        ...

Note there is another pc-relative add that we can take advantage of:
     add r1, pc, #imm_8 * 4

We should be able to generate:

LPCRELL0:
	add r1, LJTI1_0_0
	ldr r0, [r0, r1]
	mov pc, r0
	.align	2
LJTI1_0_0:
	.long	 LBB1_3

if the assembler can translate the add to:
       add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)

Note the assembler also does something similar for constpool loads:
LPCRELL0:
     ldr r0, LCPI1_0
=>
     ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)

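Returning to the first item above, here is a sketch of what 16-bit jumptable
entries and the matching dispatch could look like (labels and registers are made
up, targets are assumed to follow the table, and the assembler would need to
accept the .short expressions):

	@ r0 = case index, r1 = address of LJTI1_0_0 (from the pc-relative add)
	lsl r0, r0, #1
	ldrh r0, [r1, r0]	@ 16-bit entry = (LBB1_n - LJTI1_0_0) >> 1
	lsl r0, r0, #1
	add r0, r1		@ rebuild the target address
	mov pc, r0
	.align	2
LJTI1_0_0:
	.short	(LBB1_3-LJTI1_0_0) >> 1
	.short	(LBB1_7-LJTI1_0_0) >> 1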

//===---------------------------------------------------------------------===//

We compile the following:

define i16 @func_entry_2E_ce(i32 %i) {
        switch i32 %i, label %bb12.exitStub [
                 i32 0, label %bb4.exitStub
                 i32 1, label %bb9.exitStub
                 i32 2, label %bb4.exitStub
                 i32 3, label %bb4.exitStub
                 i32 7, label %bb9.exitStub
                 i32 8, label %bb.exitStub
                 i32 9, label %bb9.exitStub
        ]

bb12.exitStub:
        ret i16 0

bb4.exitStub:
        ret i16 1

bb9.exitStub:
        ret i16 2

bb.exitStub:
        ret i16 3
}

into:

_func_entry_2E_ce:
        mov r2, #1
        lsl r2, r0
        cmp r0, #9
        bhi LBB1_4      @bb12.exitStub
LBB1_1: @newFuncRoot
        mov r1, #13
        tst r2, r1
        bne LBB1_5      @bb4.exitStub
LBB1_2: @newFuncRoot
        ldr r1, LCPI1_0
        tst r2, r1
        bne LBB1_6      @bb9.exitStub
LBB1_3: @newFuncRoot
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1
        bne LBB1_7      @bb.exitStub
LBB1_4: @bb12.exitStub
        mov r0, #0
        bx lr
LBB1_5: @bb4.exitStub
        mov r0, #1
        bx lr
LBB1_6: @bb9.exitStub
        mov r0, #2
        bx lr
LBB1_7: @bb.exitStub
        mov r0, #3
        bx lr
LBB1_8:
        .align  2
LCPI1_0:
        .long   642


GCC compiles this to:

	cmp	r0, #9
	@ lr needed for prologue
	bhi	L2
	ldr	r3, L11
	mov	r2, #1
	mov	r1, r2, asl r0
	ands	r0, r3, r2, asl r0
	movne	r0, #2
	bxne	lr
	tst	r1, #13
	beq	L9
L3:
	mov	r0, r2
	bx	lr
L9:
	tst	r1, #256
	movne	r0, #3
	bxne	lr
L2:
	mov	r0, #0
	bx	lr
L12:
	.align 2
L11:
	.long	642

GCC is doing a couple of clever things here:
  1. It is predicating one of the returns.  This isn't a clear win though: in
     cases where that return isn't taken, it is replacing one condbranch with
     two 'ne' predicated instructions.
  2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
     tst.  This will probably require whole-function isel.
  3. GCC emits:
  	tst	r1, #256
     we emit:
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1
     ARM-mode tst accepts #256 as an immediate operand, but Thumb1 tst has no
     immediate form, so we have to materialize the constant into a register
     first.


//===---------------------------------------------------------------------===//

When spilling in Thumb mode and the sp offset is too large to fit in the ldr /
str offset field, we load the offset from a constpool entry and add it to sp:

ldr r2, LCPI
add r2, sp
ldr r2, [r2]

These instructions preserve the condition code, which is important if the spill
is between a cmp and a bcc instruction. However, we can use the (potentially)
cheaper sequence below if we know it's ok to clobber the condition register:

add r2, sp, #255 * 4
add r2, #132
ldr r2, [r2, #7 * 4]
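
For reference (assuming the usual Thumb1 encodings: add rd, sp, #imm is limited
to 255 * 4, add rd, #imm to 255, and the ldr immediate offset to 31 * 4), the
sequence above reaches sp + 255 * 4 + 132 + 7 * 4 = sp + 1180 in three
instructions without touching the constant pool.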

This is especially bad when dynamic alloca is used: then all fixed-size stack
objects are referenced off the frame pointer with negative offsets. See
oggenc for an example.


//===---------------------------------------------------------------------===//

Poor codegen in test/CodeGen/ARM/select.ll f7:

	ldr r5, LCPI1_0
LPC0:
	add r5, pc
	ldr r6, LCPI1_1
	ldr r2, LCPI1_2
	mov r3, r6
	mov lr, pc
	bx r5

//===---------------------------------------------------------------------===//

Make the register allocator / spiller smarter so we can re-materialize
"mov r, imm", etc. Almost all Thumb instructions clobber the condition code.
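
A sketch of the idea (registers and offsets are made up): instead of reloading a
spilled constant,

	ldr	r0, [sp, #8]	@ reload of a value that was produced by "mov r0, #42"

re-materialize it, keeping in mind that the Thumb1 mov immediate sets the flags:

	mov	r0, #42		@ clobbers CPSR, so only legal where the flags are dead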

//===---------------------------------------------------------------------===//

Add ldmia, stmia support.
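
For example, a sketch of the kind of merge this would enable (assuming r0 is not
used afterwards, since the Thumb1 ldmia writes the base back):

	ldr	r1, [r0]
	ldr	r2, [r0, #4]
	ldr	r3, [r0, #8]
=>
	ldmia	r0!, {r1, r2, r3}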

//===---------------------------------------------------------------------===//

Thumb load / store address mode offsets are scaled. The values kept in the
instruction operands are pre-scaled values. This probably ought to be changed
to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.

//===---------------------------------------------------------------------===//

We need to make (some of the) Thumb1 instructions predicable. That will allow
shrinking of predicated Thumb2 instructions. To allow this, we need to be able
to toggle the 's' bit, since these instructions do not set CPSR when they are
inside IT blocks.
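
A sketch of the intent (unified syntax assumed): inside an IT block the 16-bit
encodings do not update CPSR, so

	it	eq
	moveq.w	r0, #1		@ 32-bit Thumb2 encoding

could be shrunk to

	it	eq
	moveq	r0, #1		@ 16-bit encoding; no flag update inside the IT block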

//===---------------------------------------------------------------------===//

Make use of hi register variants of cmp: tCMPhir / tCMPZhir.

//===---------------------------------------------------------------------===//

Thumb1 immediate fields sometimes keep pre-scaled values. See
Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
Thumb2.

//===---------------------------------------------------------------------===//

Rather than having tBR_JTr print a ".align 2" and having the constant island
pass pad it, add a target-specific ALIGN instruction instead. That way,
GetInstSizeInBytes won't have to over-estimate. It could also be used by a loop
alignment pass.

//===---------------------------------------------------------------------===//

We generate conditional code for icmp when we don't need to. This code:

  int foo(int s) {
    return s == 1;
  }

produces:

foo:
        cmp     r0, #1
        mov.w   r0, #0
        it      eq
        moveq   r0, #1
        bx      lr

when it could use subs + adcs. This is GCC PR46975.
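
A sketch of a flag-arithmetic alternative (this may not be the exact sequence
the PR has in mind):

        subs    r0, r0, #1      @ r0 = s - 1
        rsbs    r1, r0, #0      @ carry is set iff r0 was zero, i.e. iff s == 1
        adcs    r0, r0, r1      @ r0 = (s - 1) + (1 - s) + carry = carry
        bx      lr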
    268