//===---------------------------------------------------------------------===//
// Random ideas for the ARM backend (Thumb specific).
//===---------------------------------------------------------------------===//

* Add support for compiling functions in both ARM and Thumb mode, then taking
  the smallest.

* Add support for compiling individual basic blocks in Thumb mode, when in a
  larger ARM function. This can be used for presumed cold code, like paths
  to abort (failure path of asserts), EH handling code, etc.

* Thumb doesn't have normal pre/post increment addressing modes, but you can
  load/store 32-bit integers with pre/post increment by using load/store
  multiple instrs with a single register.

* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of
  add and cmp instructions can use high registers. Also, we can use them as
  temporaries to spill values into.

* In Thumb mode, short, byte, and bool preferred alignments are currently set
  to 4 to accommodate an ISA restriction (i.e. for add sp, #imm, the immediate
  must be a multiple of 4).

//===---------------------------------------------------------------------===//

Potential jumptable improvements:

* If we know the function size is less than (1 << 16) * 2 bytes, we can use
  16-bit jumptable entries (e.g. (L1 - L2) >> 1), or even smaller entries if
  the function is smaller still. This also applies to ARM (see the tbh sketch
  at the end of this section).

* Thumb jumptable codegen can improve given some help from the assembler. This
  is what we generate right now:

        .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
LPCRELL0:
        mov r1, #PCRELV0
        add r1, pc
        ldr r0, [r0, r1]
        mov pc, r0
        .align 2
LJTI1_0_0:
        .long LBB1_3
        ...

Note there is another pc-relative add that we can take advantage of:

        add r1, pc, #imm_8 * 4

We should be able to generate:

LPCRELL0:
        add r1, LJTI1_0_0
        ldr r0, [r0, r1]
        mov pc, r0
        .align 2
LJTI1_0_0:
        .long LBB1_3

if the assembler can translate the add to:

        add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)

Note the assembler also does something similar for constpool loads:

LPCRELL0:
        ldr r0, LCPI1_0
=>
        ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
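
As a point of reference for the 16-bit entry idea: Thumb2's tbb / tbh
instructions implement exactly this kind of compressed jump table, with byte
or halfword entries each holding (target - table) / 2. A rough sketch of what
a halfword table could look like (the labels are made up for illustration; the
table must immediately follow the tbh so that pc equals the table base):

        tbh     [pc, r0, lsl #1]        @ branch to table base + 2 * entry[r0]
LJTI0_0:
        .short  (LBB0_1 - LJTI0_0) / 2
        .short  (LBB0_2 - LJTI0_0) / 2
        .short  (LBB0_3 - LJTI0_0) / 2

Thumb1 and ARM have no tbb / tbh, so getting the same space saving there would
need an explicit halfword load plus a scaled add, along the lines of the
sequences above.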

//===---------------------------------------------------------------------===//

We compile the following:

define i16 @func_entry_2E_ce(i32 %i) {
        switch i32 %i, label %bb12.exitStub [
                 i32 0, label %bb4.exitStub
                 i32 1, label %bb9.exitStub
                 i32 2, label %bb4.exitStub
                 i32 3, label %bb4.exitStub
                 i32 7, label %bb9.exitStub
                 i32 8, label %bb.exitStub
                 i32 9, label %bb9.exitStub
        ]

bb12.exitStub:
        ret i16 0

bb4.exitStub:
        ret i16 1

bb9.exitStub:
        ret i16 2

bb.exitStub:
        ret i16 3
}

into:

_func_entry_2E_ce:
        mov r2, #1
        lsl r2, r0
        cmp r0, #9
        bhi LBB1_4  @bb12.exitStub
LBB1_1: @newFuncRoot
        mov r1, #13
        tst r2, r1
        bne LBB1_5  @bb4.exitStub
LBB1_2: @newFuncRoot
        ldr r1, LCPI1_0
        tst r2, r1
        bne LBB1_6  @bb9.exitStub
LBB1_3: @newFuncRoot
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1
        bne LBB1_7  @bb.exitStub
LBB1_4: @bb12.exitStub
        mov r0, #0
        bx lr
LBB1_5: @bb4.exitStub
        mov r0, #1
        bx lr
LBB1_6: @bb9.exitStub
        mov r0, #2
        bx lr
LBB1_7: @bb.exitStub
        mov r0, #3
        bx lr
LBB1_8:
        .align 2
LCPI1_0:
        .long 642

gcc compiles to:

        cmp r0, #9
        @ lr needed for prologue
        bhi L2
        ldr r3, L11
        mov r2, #1
        mov r1, r2, asl r0
        ands r0, r3, r2, asl r0
        movne r0, #2
        bxne lr
        tst r1, #13
        beq L9
L3:
        mov r0, r2
        bx lr
L9:
        tst r1, #256
        movne r0, #3
        bxne lr
L2:
        mov r0, #0
        bx lr
L12:
        .align 2
L11:
        .long 642

GCC is doing a few clever things here:

1. It is predicating one of the returns. This isn't a clear win, though: in
   cases where that return isn't taken, it replaces one condbranch with two
   'ne'-predicated instructions.
2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
   tst. This will probably require whole-function isel.
3. GCC emits:
        tst r1, #256
   we emit:
        mov r1, #1
        lsl r1, r1, #8
        tst r2, r1

//===---------------------------------------------------------------------===//

When spilling in Thumb mode and the sp offset is too large to fit in the ldr /
str offset field, we load the offset from a constpool entry and add it to sp:

        ldr r2, LCPI
        add r2, sp
        ldr r2, [r2]

These instructions preserve the condition codes, which is important if the
spill is between a cmp and a bcc instruction. However, we can use the
(potentially) cheaper sequence below if we know it's ok to clobber the
condition register:

        add r2, sp, #255 * 4
        add r2, #132
        ldr r2, [r2, #7 * 4]

This is especially bad when dynamic alloca is used: all the fixed-size stack
objects are then referenced off the frame pointer with negative offsets. See
oggenc for an example.
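
For reference, the immediates in that flag-clobbering sequence follow from the
Thumb1 encoding limits: add rN, sp, #imm reaches up to 255 * 4 = 1020 bytes,
add rN, #imm reaches up to 255 and is the flag-setting form (which is what
clobbers the condition register), and the ldr word offset reaches up to
31 * 4 = 124. The example above thus ends up loading from
sp + 1020 + 132 + 28 = sp + 1180.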

//===---------------------------------------------------------------------===//

Poor codegen in test/CodeGen/ARM/select.ll f7:

        ldr r5, LCPI1_0
LPC0:
        add r5, pc
        ldr r6, LCPI1_1
        ldr r2, LCPI1_2
        mov r3, r6
        mov lr, pc
        bx r5

//===---------------------------------------------------------------------===//

Make the register allocator / spiller smarter so we can re-materialize
"mov r, imm", etc. Almost all Thumb instructions clobber the condition codes.

//===---------------------------------------------------------------------===//

Add ldmia, stmia support.

//===---------------------------------------------------------------------===//

Thumb load / store address mode offsets are scaled. The values kept in the
instruction operands are pre-scaled values. This probably ought to be changed
to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.

//===---------------------------------------------------------------------===//

We need to make (some of the) Thumb1 instructions predicable. That will allow
shrinking of predicated Thumb2 instructions. To allow this, we need to be able
to toggle the 's' bit since they do not set CPSR when they are inside IT blocks.

//===---------------------------------------------------------------------===//

Make use of hi register variants of cmp: tCMPhir / tCMPZhir.

//===---------------------------------------------------------------------===//

Thumb1 immediate fields sometimes keep pre-scaled values. See
Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
Thumb2.

//===---------------------------------------------------------------------===//

Rather than having tBR_JTr print a ".align 2" and the constant island pass pad
it, add a target-specific ALIGN instruction instead. That way,
GetInstSizeInBytes won't have to over-estimate. It can also be used by a loop
alignment pass.

//===---------------------------------------------------------------------===//

We generate conditional code for icmp when we don't need to. This code:

        int foo(int s) {
            return s == 1;
        }

produces:

foo:
        cmp     r0, #1
        mov.w   r0, #0
        it      eq
        moveq   r0, #1
        bx      lr

when it could use subs + adcs. This is GCC PR46975.
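
A branchless, flag-based sketch of what that could look like (an illustration
of the idea, not necessarily the exact sequence the PR has in mind):

        subs    r0, r0, #1      @ r0 = s - 1
        rsbs    r1, r0, #0      @ r1 = -(s - 1); carry is set iff s - 1 == 0
        adcs    r0, r0, r1      @ r0 = (s - 1) - (s - 1) + carry = carry
        bx      lr              @ returns 1 iff s == 1, with no IT block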