/*
 * Copyright 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
22 */ 23 24 #include "util/ralloc.h" 25 #include "util/register_allocate.h" 26 #include "vc4_context.h" 27 #include "vc4_qir.h" 28 #include "vc4_qpu.h" 29 30 #define QPU_R(file, index) { QPU_MUX_##file, index } 31 32 static const struct qpu_reg vc4_regs[] = { 33 { QPU_MUX_R0, 0}, 34 { QPU_MUX_R1, 0}, 35 { QPU_MUX_R2, 0}, 36 { QPU_MUX_R3, 0}, 37 { QPU_MUX_R4, 0}, 38 QPU_R(A, 0), 39 QPU_R(B, 0), 40 QPU_R(A, 1), 41 QPU_R(B, 1), 42 QPU_R(A, 2), 43 QPU_R(B, 2), 44 QPU_R(A, 3), 45 QPU_R(B, 3), 46 QPU_R(A, 4), 47 QPU_R(B, 4), 48 QPU_R(A, 5), 49 QPU_R(B, 5), 50 QPU_R(A, 6), 51 QPU_R(B, 6), 52 QPU_R(A, 7), 53 QPU_R(B, 7), 54 QPU_R(A, 8), 55 QPU_R(B, 8), 56 QPU_R(A, 9), 57 QPU_R(B, 9), 58 QPU_R(A, 10), 59 QPU_R(B, 10), 60 QPU_R(A, 11), 61 QPU_R(B, 11), 62 QPU_R(A, 12), 63 QPU_R(B, 12), 64 QPU_R(A, 13), 65 QPU_R(B, 13), 66 QPU_R(A, 14), 67 QPU_R(B, 14), 68 QPU_R(A, 15), 69 QPU_R(B, 15), 70 QPU_R(A, 16), 71 QPU_R(B, 16), 72 QPU_R(A, 17), 73 QPU_R(B, 17), 74 QPU_R(A, 18), 75 QPU_R(B, 18), 76 QPU_R(A, 19), 77 QPU_R(B, 19), 78 QPU_R(A, 20), 79 QPU_R(B, 20), 80 QPU_R(A, 21), 81 QPU_R(B, 21), 82 QPU_R(A, 22), 83 QPU_R(B, 22), 84 QPU_R(A, 23), 85 QPU_R(B, 23), 86 QPU_R(A, 24), 87 QPU_R(B, 24), 88 QPU_R(A, 25), 89 QPU_R(B, 25), 90 QPU_R(A, 26), 91 QPU_R(B, 26), 92 QPU_R(A, 27), 93 QPU_R(B, 27), 94 QPU_R(A, 28), 95 QPU_R(B, 28), 96 QPU_R(A, 29), 97 QPU_R(B, 29), 98 QPU_R(A, 30), 99 QPU_R(B, 30), 100 QPU_R(A, 31), 101 QPU_R(B, 31), 102 }; 103 #define ACC_INDEX 0 104 #define AB_INDEX (ACC_INDEX + 5) 105 106 static void 107 vc4_alloc_reg_set(struct vc4_context *vc4) 108 { 109 assert(vc4_regs[AB_INDEX].addr == 0); 110 assert(vc4_regs[AB_INDEX + 1].addr == 0); 111 STATIC_ASSERT(ARRAY_SIZE(vc4_regs) == AB_INDEX + 64); 112 113 if (vc4->regs) 114 return; 115 116 vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true); 117 118 /* The physical regfiles split us into two classes, with [0] being the 119 * whole space and [1] being the bottom half (for threaded fragment 120 * shaders). 
121 */ 122 for (int i = 0; i < 2; i++) { 123 vc4->reg_class_any[i] = ra_alloc_reg_class(vc4->regs); 124 vc4->reg_class_a_or_b[i] = ra_alloc_reg_class(vc4->regs); 125 vc4->reg_class_a_or_b_or_acc[i] = ra_alloc_reg_class(vc4->regs); 126 vc4->reg_class_r4_or_a[i] = ra_alloc_reg_class(vc4->regs); 127 vc4->reg_class_a[i] = ra_alloc_reg_class(vc4->regs); 128 } 129 vc4->reg_class_r0_r3 = ra_alloc_reg_class(vc4->regs); 130 131 /* r0-r3 */ 132 for (uint32_t i = ACC_INDEX; i < ACC_INDEX + 4; i++) { 133 ra_class_add_reg(vc4->regs, vc4->reg_class_r0_r3, i); 134 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); 135 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); 136 } 137 138 /* R4 gets a special class because it can't be written as a general 139 * purpose register. (it's TMU_NOSWAP as a write address). 140 */ 141 for (int i = 0; i < 2; i++) { 142 ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[i], 143 ACC_INDEX + 4); 144 ra_class_add_reg(vc4->regs, vc4->reg_class_any[i], 145 ACC_INDEX + 4); 146 } 147 148 /* A/B */ 149 for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i ++) { 150 /* Reserve ra14/rb14 for spilling fixup_raddr_conflict() in 151 * vc4_qpu_emit.c 152 */ 153 if (vc4_regs[i].addr == 14) 154 continue; 155 156 ra_class_add_reg(vc4->regs, vc4->reg_class_any[0], i); 157 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[0], i); 158 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[0], i); 159 160 if (vc4_regs[i].addr < 16) { 161 ra_class_add_reg(vc4->regs, vc4->reg_class_any[1], i); 162 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b[1], i); 163 ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc[1], i); 164 } 165 166 167 /* A only */ 168 if (((i - AB_INDEX) & 1) == 0) { 169 ra_class_add_reg(vc4->regs, vc4->reg_class_a[0], i); 170 ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a[0], i); 171 172 if (vc4_regs[i].addr < 16) { 173 ra_class_add_reg(vc4->regs, 174 vc4->reg_class_a[1], i); 175 ra_class_add_reg(vc4->regs, 176 
vc4->reg_class_r4_or_a[1], i); 177 } 178 } 179 } 180 181 ra_set_finalize(vc4->regs, NULL); 182 } 183 184 struct node_to_temp_map { 185 uint32_t temp; 186 uint32_t priority; 187 }; 188 189 static int 190 node_to_temp_priority(const void *in_a, const void *in_b) 191 { 192 const struct node_to_temp_map *a = in_a; 193 const struct node_to_temp_map *b = in_b; 194 195 return a->priority - b->priority; 196 } 197 198 #define CLASS_BIT_A (1 << 0) 199 #define CLASS_BIT_B (1 << 1) 200 #define CLASS_BIT_R4 (1 << 2) 201 #define CLASS_BIT_R0_R3 (1 << 4) 202 203 /** 204 * Returns a mapping from QFILE_TEMP indices to struct qpu_regs. 205 * 206 * The return value should be freed by the caller. 207 */ 208 struct qpu_reg * 209 vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) 210 { 211 struct node_to_temp_map map[c->num_temps]; 212 uint32_t temp_to_node[c->num_temps]; 213 uint8_t class_bits[c->num_temps]; 214 struct qpu_reg *temp_registers = calloc(c->num_temps, 215 sizeof(*temp_registers)); 216 217 /* If things aren't ever written (undefined values), just read from 218 * r0. 219 */ 220 for (uint32_t i = 0; i < c->num_temps; i++) 221 temp_registers[i] = qpu_rn(0); 222 223 vc4_alloc_reg_set(vc4); 224 225 struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, 226 c->num_temps); 227 228 /* Compute the live ranges so we can figure out interference. */ 229 qir_calculate_live_intervals(c); 230 231 for (uint32_t i = 0; i < c->num_temps; i++) { 232 map[i].temp = i; 233 map[i].priority = c->temp_end[i] - c->temp_start[i]; 234 } 235 qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); 236 for (uint32_t i = 0; i < c->num_temps; i++) { 237 temp_to_node[map[i].temp] = i; 238 } 239 240 /* Figure out our register classes and preallocated registers. We 241 * start with any temp being able to be in any file, then instructions 242 * incrementally remove bits that the temp definitely can't be in. 
243 */ 244 memset(class_bits, 245 CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3, 246 sizeof(class_bits)); 247 248 int ip = 0; 249 qir_for_each_inst_inorder(inst, c) { 250 if (qir_writes_r4(inst)) { 251 /* This instruction writes r4 (and optionally moves 252 * its result to a temp), so nothing else can be 253 * stored in r4 across it. 254 */ 255 for (int i = 0; i < c->num_temps; i++) { 256 if (c->temp_start[i] < ip && c->temp_end[i] > ip) 257 class_bits[i] &= ~CLASS_BIT_R4; 258 } 259 } else { 260 /* R4 can't be written as a general purpose 261 * register. (it's TMU_NOSWAP as a write address). 262 */ 263 if (inst->dst.file == QFILE_TEMP) 264 class_bits[inst->dst.index] &= ~CLASS_BIT_R4; 265 } 266 267 switch (inst->op) { 268 case QOP_FRAG_Z: 269 ra_set_node_reg(g, temp_to_node[inst->dst.index], 270 AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1); 271 break; 272 273 case QOP_FRAG_W: 274 ra_set_node_reg(g, temp_to_node[inst->dst.index], 275 AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2); 276 break; 277 278 case QOP_ROT_MUL: 279 assert(inst->src[0].file == QFILE_TEMP); 280 class_bits[inst->src[0].index] &= CLASS_BIT_R0_R3; 281 break; 282 283 case QOP_THRSW: 284 /* All accumulators are invalidated across a thread 285 * switch. 286 */ 287 for (int i = 0; i < c->num_temps; i++) { 288 if (c->temp_start[i] < ip && c->temp_end[i] > ip) 289 class_bits[i] &= ~(CLASS_BIT_R0_R3 | 290 CLASS_BIT_R4); 291 } 292 break; 293 294 default: 295 break; 296 } 297 298 if (inst->dst.pack && !qir_is_mul(inst)) { 299 /* The non-MUL pack flags require an A-file dst 300 * register. 301 */ 302 class_bits[inst->dst.index] &= CLASS_BIT_A; 303 } 304 305 /* Apply restrictions for src unpacks. The integer unpacks 306 * can only be done from regfile A, while float unpacks can be 307 * either A or R4. 
308 */ 309 for (int i = 0; i < qir_get_nsrc(inst); i++) { 310 if (inst->src[i].file == QFILE_TEMP && 311 inst->src[i].pack) { 312 if (qir_is_float_input(inst)) { 313 class_bits[inst->src[i].index] &= 314 CLASS_BIT_A | CLASS_BIT_R4; 315 } else { 316 class_bits[inst->src[i].index] &= 317 CLASS_BIT_A; 318 } 319 } 320 } 321 322 ip++; 323 } 324 325 for (uint32_t i = 0; i < c->num_temps; i++) { 326 int node = temp_to_node[i]; 327 328 switch (class_bits[i]) { 329 case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R4 | CLASS_BIT_R0_R3: 330 ra_set_node_class(g, node, 331 vc4->reg_class_any[c->fs_threaded]); 332 break; 333 case CLASS_BIT_A | CLASS_BIT_B: 334 ra_set_node_class(g, node, 335 vc4->reg_class_a_or_b[c->fs_threaded]); 336 break; 337 case CLASS_BIT_A | CLASS_BIT_B | CLASS_BIT_R0_R3: 338 ra_set_node_class(g, node, 339 vc4->reg_class_a_or_b_or_acc[c->fs_threaded]); 340 break; 341 case CLASS_BIT_A | CLASS_BIT_R4: 342 ra_set_node_class(g, node, 343 vc4->reg_class_r4_or_a[c->fs_threaded]); 344 break; 345 case CLASS_BIT_A: 346 ra_set_node_class(g, node, 347 vc4->reg_class_a[c->fs_threaded]); 348 break; 349 case CLASS_BIT_R0_R3: 350 ra_set_node_class(g, node, vc4->reg_class_r0_r3); 351 break; 352 353 default: 354 /* DDX/DDY used across thread switched might get us 355 * here. 
356 */ 357 if (c->fs_threaded) { 358 c->failed = true; 359 free(temp_registers); 360 return NULL; 361 } 362 363 fprintf(stderr, "temp %d: bad class bits: 0x%x\n", 364 i, class_bits[i]); 365 abort(); 366 break; 367 } 368 } 369 370 for (uint32_t i = 0; i < c->num_temps; i++) { 371 for (uint32_t j = i + 1; j < c->num_temps; j++) { 372 if (!(c->temp_start[i] >= c->temp_end[j] || 373 c->temp_start[j] >= c->temp_end[i])) { 374 ra_add_node_interference(g, 375 temp_to_node[i], 376 temp_to_node[j]); 377 } 378 } 379 } 380 381 bool ok = ra_allocate(g); 382 if (!ok) { 383 if (!c->fs_threaded) { 384 fprintf(stderr, "Failed to register allocate:\n"); 385 qir_dump(c); 386 } 387 388 c->failed = true; 389 free(temp_registers); 390 return NULL; 391 } 392 393 for (uint32_t i = 0; i < c->num_temps; i++) { 394 temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])]; 395 396 /* If the value's never used, just write to the NOP register 397 * for clarity in debug output. 398 */ 399 if (c->temp_start[i] == c->temp_end[i]) 400 temp_registers[i] = qpu_ra(QPU_W_NOP); 401 } 402 403 ralloc_free(g); 404 405 return temp_registers; 406 } 407