/*
 * Copyright 2010 Intel Corporation
 * Copyright 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file vc4_qpu_schedule.c
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads. Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct list_head link;
        struct queued_qpu_inst *inst;
        struct schedule_node_child *children;
        uint32_t child_count;
        uint32_t child_array_size;
        uint32_t parent_count;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * Cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;

        /**
         * Which uniform from uniform_data[] this instruction read, or -1 if
         * not reading a uniform.
         */
        int uniform;
};

struct schedule_node_child {
        struct schedule_node *node;
        bool write_after_read;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        struct schedule_node *last_r[6];
        struct schedule_node *last_ra[32];
        struct schedule_node *last_rb[32];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_uniforms_reset;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};

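/**
 * Adds a scheduling dependency so that "after" is emitted no earlier than
 * "before", swapping the two when the block is being walked in reverse.
 * Read dependencies recorded during the reverse walk are tagged as
 * write-after-read edges, which can be relaxed once the reader has issued.
 */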
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        for (int i = 0; i < before->child_count; i++) {
                if (before->children[i].node == after &&
                    (before->children[i].write_after_read == write_after_read)) {
                        return;
                }
        }

        if (before->child_array_size <= before->child_count) {
                before->child_array_size = MAX2(before->child_array_size * 2, 16);
                before->children = reralloc(before, before->children,
                                            struct schedule_node_child,
                                            before->child_array_size);
        }

        before->children[before->child_count].node = after;
        before->children[before->child_count].write_after_read =
                write_after_read;
        before->child_count++;
        after->parent_count++;
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

static bool
qpu_writes_r4(uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        switch (sig) {
        case QPU_SIG_COLOR_LOAD:
        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
        case QPU_SIG_ALPHA_MASK_LOAD:
                return true;
        default:
                return false;
        }
}

static void
process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t raddr, bool is_a)
{
        switch (raddr) {
        case QPU_R_VARY:
                add_write_dep(state, &state->last_r[5], n);
                break;

        case QPU_R_VPM:
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case QPU_R_UNIF:
                add_read_dep(state, state->last_uniforms_reset, n);
                break;

        case QPU_R_NOP:
        case QPU_R_ELEM_QPU:
        case QPU_R_XY_PIXEL_COORD:
        case QPU_R_MS_REV_FLAGS:
                break;

        default:
                if (raddr < 32) {
                        if (is_a)
                                add_read_dep(state, state->last_ra[raddr], n);
                        else
                                add_read_dep(state, state->last_rb[raddr], n);
                } else {
                        fprintf(stderr, "unknown raddr %d\n", raddr);
                        abort();
                }
                break;
        }
}

static bool
is_tmu_write(uint32_t waddr)
{
        switch (waddr) {
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

static bool
reads_uniform(uint64_t inst)
{
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
                return false;

        return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF ||
                (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF &&
                 QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) ||
                is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 uint32_t mux)
{
        if (mux != QPU_MUX_A && mux != QPU_MUX_B)
                add_read_dep(state, state->last_r[mux], n);
}

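/**
 * Tracks dependencies for the ADD or MUL pipeline's write, using the WS bit
 * to work out whether a low waddr lands in regfile A or B, and routing the
 * special addresses (TMU, TLB, VPM, SFU, uniforms address) to their own
 * ordering state.
 */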
static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool is_add)
{
        uint64_t inst = n->inst->inst;
        bool is_a = is_add ^ ((inst & QPU_WS) != 0);

        if (waddr < 32) {
                if (is_a) {
                        add_write_dep(state, &state->last_ra[waddr], n);
                } else {
                        add_write_dep(state, &state->last_rb[waddr], n);
                }
        } else if (is_tmu_write(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
                add_read_dep(state, state->last_uniforms_reset, n);
        } else if (qpu_waddr_is_tlb(waddr) ||
                   waddr == QPU_W_MS_FLAGS) {
                add_write_dep(state, &state->last_tlb, n);
        } else {
                switch (waddr) {
                case QPU_W_ACC0:
                case QPU_W_ACC1:
                case QPU_W_ACC2:
                case QPU_W_ACC3:
                case QPU_W_ACC5:
                        add_write_dep(state, &state->last_r[waddr - QPU_W_ACC0],
                                      n);
                        break;

                case QPU_W_VPM:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_VPMVCD_SETUP:
                        if (is_a)
                                add_write_dep(state, &state->last_vpm_read, n);
                        else
                                add_write_dep(state, &state->last_vpm, n);
                        break;

                case QPU_W_SFU_RECIP:
                case QPU_W_SFU_RECIPSQRT:
                case QPU_W_SFU_EXP:
                case QPU_W_SFU_LOG:
                        add_write_dep(state, &state->last_r[4], n);
                        break;

                case QPU_W_TLB_STENCIL_SETUP:
                        /* This isn't a TLB operation that does things like
                         * implicitly lock the scoreboard, but it does have to
                         * appear before TLB_Z, and the TLB_STENCIL_SETUP
                         * writes have to be scheduled in the same order
                         * relative to each other.
                         */
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_MS_FLAGS:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case QPU_W_UNIFORMS_ADDRESS:
                        add_write_dep(state, &state->last_uniforms_reset, n);
                        break;

                case QPU_W_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

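/**
 * Instructions with a real condition code consume the flags, so they have to
 * stay ordered after the last instruction that set them.
 */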
static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  uint32_t cond)
{
        switch (cond) {
        case QPU_COND_NEVER:
        case QPU_COND_ALWAYS:
                break;
        default:
                add_read_dep(state, state->last_sf, n);
                break;
        }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        uint64_t inst = n->inst->inst;
        uint32_t add_op = QPU_GET_FIELD(inst, QPU_OP_ADD);
        uint32_t mul_op = QPU_GET_FIELD(inst, QPU_OP_MUL);
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
        uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
        uint32_t mul_a = QPU_GET_FIELD(inst, QPU_MUL_A);
        uint32_t mul_b = QPU_GET_FIELD(inst, QPU_MUL_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        if (sig != QPU_SIG_LOAD_IMM) {
                process_raddr_deps(state, n, raddr_a, true);
                if (sig != QPU_SIG_SMALL_IMM &&
                    sig != QPU_SIG_BRANCH)
                        process_raddr_deps(state, n, raddr_b, false);
        }

        if (add_op != QPU_A_NOP) {
                process_mux_deps(state, n, add_a);
                process_mux_deps(state, n, add_b);
        }
        if (mul_op != QPU_M_NOP) {
                process_mux_deps(state, n, mul_a);
                process_mux_deps(state, n, mul_b);
        }

        process_waddr_deps(state, n, waddr_add, true);
        process_waddr_deps(state, n, waddr_mul, false);
        if (qpu_writes_r4(inst))
                add_write_dep(state, &state->last_r[4], n);

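        /* Some signals have side effects that aren't visible in the register
         * fields above: thread switches clobber the accumulators and flags,
         * TMU loads pop a result FIFO, and color loads read the TLB.  Add
         * those dependencies by hand.
         */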
        switch (sig) {
        case QPU_SIG_SW_BREAKPOINT:
        case QPU_SIG_NONE:
        case QPU_SIG_SMALL_IMM:
        case QPU_SIG_LOAD_IMM:
                break;

        case QPU_SIG_THREAD_SWITCH:
        case QPU_SIG_LAST_THREAD_SWITCH:
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_LOAD_TMU0:
        case QPU_SIG_LOAD_TMU1:
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
                break;

        case QPU_SIG_COLOR_LOAD:
                add_read_dep(state, state->last_tlb, n);
                break;

        case QPU_SIG_BRANCH:
                add_read_dep(state, state->last_sf, n);
                break;

        case QPU_SIG_PROG_END:
        case QPU_SIG_WAIT_FOR_SCOREBOARD:
        case QPU_SIG_SCOREBOARD_UNLOCK:
        case QPU_SIG_COVERAGE_LOAD:
        case QPU_SIG_COLOR_LOAD_END:
        case QPU_SIG_ALPHA_MASK_LOAD:
                fprintf(stderr, "Unhandled signal bits %d\n", sig);
                abort();
        }

        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD));
        process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL));
        if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH)
                add_write_dep(state, &state->last_sf, n);
}

static void
calculate_forward_deps(struct vc4_compile *c, struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct vc4_compile *c, struct list_head *schedule_list)
{
        struct list_head *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.dir = R;

        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

struct choose_scoreboard {
        int tick;
        int last_sfu_write_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_a, last_waddr_b;
        bool tlb_locked;
};

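/**
 * Returns true if the instruction would read a physical regfile location,
 * r4, or the uniform stream before the data from a recent write is
 * available, which would violate the QPU's result-latency rules.
 */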
static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* Full immediate loads don't read any registers. */
        if (sig == QPU_SIG_LOAD_IMM)
                return false;

        uint32_t src_muxes[] = {
                QPU_GET_FIELD(inst, QPU_ADD_A),
                QPU_GET_FIELD(inst, QPU_ADD_B),
                QPU_GET_FIELD(inst, QPU_MUL_A),
                QPU_GET_FIELD(inst, QPU_MUL_B),
        };
        for (int i = 0; i < ARRAY_SIZE(src_muxes); i++) {
                if ((src_muxes[i] == QPU_MUX_A &&
                     raddr_a < 32 &&
                     scoreboard->last_waddr_a == raddr_a) ||
                    (src_muxes[i] == QPU_MUX_B &&
                     sig != QPU_SIG_SMALL_IMM &&
                     raddr_b < 32 &&
                     scoreboard->last_waddr_b == raddr_b)) {
                        return true;
                }

                if (src_muxes[i] == QPU_MUX_R4) {
                        if (scoreboard->tick -
                            scoreboard->last_sfu_write_tick <= 2) {
                                return true;
                        }
                }
        }

        if (sig == QPU_SIG_SMALL_IMM &&
            QPU_GET_FIELD(inst, QPU_SMALL_IMM) >= QPU_SMALL_IMM_MUL_ROT) {
                uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                if (scoreboard->last_waddr_a == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_a == mux_b + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_a + QPU_W_ACC0 ||
                    scoreboard->last_waddr_b == mux_b + QPU_W_ACC0) {
                        return true;
                }
        }

        if (reads_uniform(inst) &&
            scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
                return true;
        }

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard, uint64_t inst)
{
        return (scoreboard->tick < 2 && qpu_inst_is_tlb(inst));
}

static int
get_instruction_priority(uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (sig == QPU_SIG_LOAD_TMU0 || sig == QPU_SIG_LOAD_TMU1)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul))
                return next_score;
        next_score++;

        return baseline_score;
}

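/**
 * Picks the best DAG head to schedule next.  If prev_inst is non-NULL, only
 * instructions that can legally merge with it into a single QPU instruction
 * are considered.  Higher get_instruction_priority() scores win, with ties
 * broken in favor of the longer critical-path delay.
 */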
static struct schedule_node *
choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
                                                  QPU_SIG);
                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
                        return NULL;
                }
        }

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                uint64_t inst = n->inst->inst;
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                /* Don't choose the branch instruction until it's the last one
                 * left. XXX: We could potentially choose it before it's the
                 * last one, if the remaining instructions fit in the delay
                 * slots.
                 */
                if (sig == QPU_SIG_BRANCH &&
                    !list_is_singular(schedule_list)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 * regfile A or B that was written to by the previous
                 * instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 * instructions of a fragment shader. This is either the
                 * explicit Wait for Scoreboard signal or an implicit wait
                 * with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                continue;
                        }

                        if (prev_inst->uniform != -1 && n->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        inst = qpu_merge_inst(prev_inst->inst->inst, inst);
                        if (!inst)
                                continue;
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction. If nothing better comes along,
                 * this one works.
                 */
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

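/**
 * Records the state the chosen instruction leaves behind (last regfile A/B
 * writes, SFU and uniforms-address write ticks, TLB locking) so that later
 * choices can respect the hardware's latency rules.
 */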
static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             uint64_t inst)
{
        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

        if (!(inst & QPU_WS)) {
                scoreboard->last_waddr_a = waddr_add;
                scoreboard->last_waddr_b = waddr_mul;
        } else {
                scoreboard->last_waddr_b = waddr_add;
                scoreboard->last_waddr_a = waddr_mul;
        }

        if ((waddr_add >= QPU_W_SFU_RECIP && waddr_add <= QPU_W_SFU_LOG) ||
            (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                scoreboard->last_sfu_write_tick = scoreboard->tick;
        }

        if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
            waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
                scoreboard->last_uniforms_reset_tick = scoreboard->tick;
        }

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}

static void
dump_state(struct list_head *schedule_list)
{
        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                fprintf(stderr, " t=%4d: ", n->unblocked_time);
                vc4_qpu_disasm(&n->inst->inst, 1);
                fprintf(stderr, "\n");

                for (int i = 0; i < n->child_count; i++) {
                        struct schedule_node *child = n->children[i].node;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        vc4_qpu_disasm(&child->inst->inst, 1);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->parent_count,
                                n->children[i].write_after_read ?
                                'w' : 'r');
                }
        }
}

static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
{
        if (waddr < 32)
                return 2;

        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus. If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (waddr == QPU_W_TMU0_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
                        return 100;
        }
        if (waddr == QPU_W_TMU1_S) {
                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
                        return 100;
        }

        switch (waddr) {
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
                return 3;
        default:
                return 1;
        }
}

static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        uint64_t before_inst = before->inst->inst;
        uint64_t after_inst = after->inst->inst;

        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
                                  after_inst),
                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
                                  after_inst));
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
{
        if (!n->child_count) {
                n->delay = 1;
        } else {
                for (int i = 0; i < n->child_count; i++) {
                        if (!n->children[i].node->delay)
                                compute_delay(n->children[i].node);
                        n->delay = MAX2(n->delay,
                                        n->children[i].node->delay +
                                        instruction_latency(n, n->children[i].node));
                }
        }
}

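/**
 * Removes the scheduled node's outgoing DAG edges, updating each child's
 * unblocked time and adding children whose last parent this was to the list
 * of schedulable instructions.  With war_only set, only write-after-read
 * edges are released, and with no added latency, since the blocked write
 * just has to land after the read it was waiting on.
 */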
static void
mark_instruction_scheduled(struct list_head *schedule_list,
                           uint32_t time,
                           struct schedule_node *node,
                           bool war_only)
{
        if (!node)
                return;

        for (int i = node->child_count - 1; i >= 0; i--) {
                struct schedule_node *child =
                        node->children[i].node;

                if (!child)
                        continue;

                if (war_only && !node->children[i].write_after_read)
                        continue;

                /* If the requirement is only that the node not appear before
                 * the last read of its destination, then it can be scheduled
                 * immediately after (or paired with!) the thing reading the
                 * destination.
                 */
                uint32_t latency = 0;
                if (!war_only) {
                        latency = instruction_latency(node,
                                                      node->children[i].node);
                }

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
                child->parent_count--;
                if (child->parent_count == 0)
                        list_add(&child->link, schedule_list);

                node->children[i].node = NULL;
        }
}

/**
 * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
 * with another instruction.
 */
static void
emit_thrsw(struct vc4_compile *c,
           struct choose_scoreboard *scoreboard,
           uint64_t inst)
{
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);

        /* Try to find an earlier scheduled instruction that we can merge the
         * thrsw into.
         */
        int thrsw_ip = c->qpu_inst_count;
        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);

                if (prev_sig == QPU_SIG_NONE)
                        thrsw_ip = c->qpu_inst_count - i;
        }

        if (thrsw_ip != c->qpu_inst_count) {
                /* Merge the thrsw into the existing instruction. */
                c->qpu_insts[thrsw_ip] =
                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
        } else {
                qpu_serialize_one_inst(c, inst);
                update_scoreboard_for_chosen(scoreboard, inst);
        }

        /* Fill the delay slots. */
        while (c->qpu_inst_count < thrsw_ip + 3) {
                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
                qpu_serialize_one_inst(c, qpu_NOP());
        }
}

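/**
 * Core list-scheduling loop for a basic block: repeatedly picks a ready
 * instruction (or a NOP if nothing can legally issue yet), tries to pair a
 * second instruction with it, emits the result, and reorders the uniform
 * stream to match the new instruction order.
 */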
static uint32_t
schedule_instructions(struct vc4_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        uint32_t time = 0;

        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(schedule_list);
                fprintf(stderr, "\n");
        }

        /* Remove non-DAG heads from the list. */
        list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
                if (n->parent_count != 0)
                        list_del(&n->link);
        }

        while (!list_empty(schedule_list)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(schedule_list);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        vc4_qpu_disasm(&inst, 1);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list. Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        list_del(&chosen->link);
                        mark_instruction_scheduled(schedule_list, time,
                                                   chosen, true);
                        if (chosen->uniform != -1) {
                                c->uniform_data[*next_uniform] =
                                        orig_uniform_data[chosen->uniform];
                                c->uniform_contents[*next_uniform] =
                                        orig_uniform_contents[chosen->uniform];
                                (*next_uniform)++;
                        }

                        merge = choose_instruction_to_schedule(scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                list_del(&merge->link);
                                inst = qpu_merge_inst(inst, merge->inst->inst);
                                assert(inst != 0);
                                if (merge->uniform != -1) {
                                        c->uniform_data[*next_uniform] =
                                                orig_uniform_data[merge->uniform];
                                        c->uniform_contents[*next_uniform] =
                                                orig_uniform_contents[merge->uniform];
                                        (*next_uniform)++;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        vc4_qpu_disasm(&merge->inst->inst, 1);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, " resulting in: ");
                                        vc4_qpu_disasm(&inst, 1);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready
                 * to be scheduled. Update the children's unblocked time for
                 * this DAG edge as we do so.
                 */
                mark_instruction_scheduled(schedule_list, time, chosen, false);
                mark_instruction_scheduled(schedule_list, time, merge, false);

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                    QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
                        emit_thrsw(c, scoreboard, inst);
                } else {
                        qpu_serialize_one_inst(c, inst);
                        update_scoreboard_for_chosen(scoreboard, inst);
                }

                scoreboard->tick++;
                time++;

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) {
                        block->branch_qpu_ip = c->qpu_inst_count - 1;
                        /* Fill the delay slots.
                         *
                         * We should fill these with actual instructions,
                         * instead, but that will probably need to be done
                         * after this, once we know what the leading
                         * instructions of the successors are (so we can
                         * handle A/B register file write latency)
                         */
                        inst = qpu_NOP();
                        update_scoreboard_for_chosen(scoreboard, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                        qpu_serialize_one_inst(c, inst);
                }
        }

        return time;
}

static uint32_t
qpu_schedule_instructions_block(struct vc4_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        struct list_head schedule_list;

        list_inithead(&schedule_list);

        /* Wrap each instruction in a scheduler structure. */
        uint32_t next_sched_uniform = *next_uniform;
        while (!list_empty(&block->qpu_inst_list)) {
                struct queued_qpu_inst *inst =
                        (struct queued_qpu_inst *)block->qpu_inst_list.next;
                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);

                n->inst = inst;

                if (reads_uniform(inst->inst)) {
                        n->uniform = next_sched_uniform++;
                } else {
                        n->uniform = -1;
                }
                list_del(&inst->link);
                list_addtail(&n->link, &schedule_list);
        }

        calculate_forward_deps(c, &schedule_list);
        calculate_reverse_deps(c, &schedule_list);

        list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
                compute_delay(n);
        }

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                &schedule_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);

        return cycles;
}

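/**
 * Fills in the relative branch targets now that every block's QPU IP range
 * is known.  Branch offsets are in bytes relative to the instruction four
 * slots after the branch, i.e. the first instruction following its delay
 * slots.
 */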
static void
qpu_set_branch_targets(struct vc4_compile *c)
{
        qir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Set the branch target for the block that doesn't follow
                 * immediately after ours.
                 */
                uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip];
                assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH);
                assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0);

                uint32_t branch_target =
                        (block->successors[0]->start_qpu_ip -
                         (block->branch_qpu_ip + 4)) * sizeof(uint64_t);
                *branch_inst = (*branch_inst |
                                QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET));

                /* Make sure that the if-we-don't-jump successor was scheduled
                 * just after the delay slots.
                 */
                if (block->successors[1]) {
                        assert(block->successors[1]->start_qpu_ip ==
                               block->branch_qpu_ip + 4);
                }
        }
}

uint32_t
qpu_schedule_instructions(struct vc4_compile *c)
{
        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_a = ~0;
        scoreboard.last_waddr_b = ~0;
        scoreboard.last_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                qir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct queued_qpu_inst, q,
                                            &block->qpu_inst_list, link) {
                                vc4_qpu_disasm(&q->inst, 1);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        qir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        if (debug) {
                fprintf(stderr, "Post-schedule instructions\n");
                vc4_qpu_disasm(c->qpu_insts, c->qpu_inst_count);
                fprintf(stderr, "\n");
        }

        return cycles;
}