/*
 * Copyright 2010 Intel Corporation
 * Copyright 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * The basic model of the list scheduler is to take a basic block, compute a
 * DAG of the dependencies, and make a list of the DAG heads.  Heuristically
 * pick a DAG head, then put all the children that are now DAG heads into the
 * list of things to schedule.
 *
 * The goal of scheduling here is to pack pairs of operations together in a
 * single QPU instruction.
 */

#include "qpu/qpu_disasm.h"
#include "v3d_compiler.h"
#include "util/ralloc.h"

static bool debug;

struct schedule_node_child;

struct schedule_node {
        struct list_head link;
        struct qinst *inst;
        struct schedule_node_child *children;
        uint32_t child_count;
        uint32_t child_array_size;
        uint32_t parent_count;

        /* Longest cycles + instruction_latency() of any parent of this node. */
        uint32_t unblocked_time;

        /**
         * Minimum number of cycles from scheduling this instruction until the
         * end of the program, based on the slowest dependency chain through
         * the children.
         */
        uint32_t delay;

        /**
         * cycles between this instruction being scheduled and when its result
         * can be consumed.
         */
        uint32_t latency;
};

struct schedule_node_child {
        struct schedule_node *node;
        bool write_after_read;
};

/* When walking the instructions in reverse, we need to swap before/after in
 * add_dep().
 */
enum direction { F, R };

struct schedule_state {
        const struct v3d_device_info *devinfo;
        struct schedule_node *last_r[6];
        struct schedule_node *last_rf[64];
        struct schedule_node *last_sf;
        struct schedule_node *last_vpm_read;
        struct schedule_node *last_tmu_write;
        struct schedule_node *last_tmu_config;
        struct schedule_node *last_tlb;
        struct schedule_node *last_vpm;
        struct schedule_node *last_unif;
        struct schedule_node *last_rtop;
        enum direction dir;
        /* Estimated cycle when the current instruction would start. */
        uint32_t time;
};
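
/* Added summary of the helper below: add_dep() records a dependency edge
 * from "before" to "after" in the DAG (the operands are swapped when walking
 * in reverse), deduplicating edges and growing the child array on demand.
 * write_after_read marks edges that only order a write after the last read
 * of its destination.
 */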
static void
add_dep(struct schedule_state *state,
        struct schedule_node *before,
        struct schedule_node *after,
        bool write)
{
        bool write_after_read = !write && state->dir == R;

        if (!before || !after)
                return;

        assert(before != after);

        if (state->dir == R) {
                struct schedule_node *t = before;
                before = after;
                after = t;
        }

        for (int i = 0; i < before->child_count; i++) {
                if (before->children[i].node == after &&
                    (before->children[i].write_after_read == write_after_read)) {
                        return;
                }
        }

        if (before->child_array_size <= before->child_count) {
                before->child_array_size = MAX2(before->child_array_size * 2, 16);
                before->children = reralloc(before, before->children,
                                            struct schedule_node_child,
                                            before->child_array_size);
        }

        before->children[before->child_count].node = after;
        before->children[before->child_count].write_after_read =
                write_after_read;
        before->child_count++;
        after->parent_count++;
}

static void
add_read_dep(struct schedule_state *state,
             struct schedule_node *before,
             struct schedule_node *after)
{
        add_dep(state, before, after, false);
}

static void
add_write_dep(struct schedule_state *state,
              struct schedule_node **before,
              struct schedule_node *after)
{
        add_dep(state, *before, after, true);
        *before = after;
}

static bool
qpu_inst_is_tlb(const struct v3d_qpu_instr *inst)
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.magic_write &&
            (inst->alu.add.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.add.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        if (inst->alu.mul.magic_write &&
            (inst->alu.mul.waddr == V3D_QPU_WADDR_TLB ||
             inst->alu.mul.waddr == V3D_QPU_WADDR_TLBU))
                return true;

        return false;
}

static void
process_mux_deps(struct schedule_state *state, struct schedule_node *n,
                 enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
                break;
        case V3D_QPU_MUX_B:
                add_read_dep(state, state->last_rf[n->inst->qpu.raddr_b], n);
                break;
        default:
                add_read_dep(state, state->last_r[mux - V3D_QPU_MUX_R0], n);
                break;
        }
}


static void
process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                   uint32_t waddr, bool magic)
{
        if (!magic) {
                add_write_dep(state, &state->last_rf[waddr], n);
        } else if (v3d_qpu_magic_waddr_is_tmu(waddr)) {
                add_write_dep(state, &state->last_tmu_write, n);
                switch (waddr) {
                case V3D_QPU_WADDR_TMUS:
                case V3D_QPU_WADDR_TMUSCM:
                case V3D_QPU_WADDR_TMUSF:
                case V3D_QPU_WADDR_TMUSLOD:
                        add_write_dep(state, &state->last_tmu_config, n);
                        break;
                default:
                        break;
                }
        } else if (v3d_qpu_magic_waddr_is_sfu(waddr)) {
                /* Handled by v3d_qpu_writes_r4() check. */
        } else {
                switch (waddr) {
                case V3D_QPU_WADDR_R0:
                case V3D_QPU_WADDR_R1:
                case V3D_QPU_WADDR_R2:
                        add_write_dep(state,
                                      &state->last_r[waddr - V3D_QPU_WADDR_R0],
                                      n);
                        break;
                case V3D_QPU_WADDR_R3:
                case V3D_QPU_WADDR_R4:
                case V3D_QPU_WADDR_R5:
                        /* Handled by v3d_qpu_writes_r*() checks below. */
                        break;

                case V3D_QPU_WADDR_VPM:
                case V3D_QPU_WADDR_VPMU:
                        add_write_dep(state, &state->last_vpm, n);
                        break;

                case V3D_QPU_WADDR_TLB:
                case V3D_QPU_WADDR_TLBU:
                        add_write_dep(state, &state->last_tlb, n);
                        break;

                case V3D_QPU_WADDR_NOP:
                        break;

                default:
                        fprintf(stderr, "Unknown waddr %d\n", waddr);
                        abort();
                }
        }
}

static void
process_cond_deps(struct schedule_state *state, struct schedule_node *n,
                  enum v3d_qpu_cond cond)
{
        if (cond != V3D_QPU_COND_NONE)
                add_read_dep(state, state->last_sf, n);
}

static void
process_pf_deps(struct schedule_state *state, struct schedule_node *n,
                enum v3d_qpu_pf pf)
{
        if (pf != V3D_QPU_PF_NONE)
                add_write_dep(state, &state->last_sf, n);
}

static void
process_uf_deps(struct schedule_state *state, struct schedule_node *n,
                enum v3d_qpu_uf uf)
{
        if (uf != V3D_QPU_UF_NONE)
                add_write_dep(state, &state->last_sf, n);
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
calculate_deps(struct schedule_state *state, struct schedule_node *n)
{
        const struct v3d_device_info *devinfo = state->devinfo;
        struct qinst *qinst = n->inst;
        struct v3d_qpu_instr *inst = &qinst->qpu;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS)
                        add_read_dep(state, state->last_sf, n);

                /* XXX: BDI */
                /* XXX: BDU */
                /* XXX: ub */
                /* XXX: raddr_a */

                add_write_dep(state, &state->last_unif, n);
                return;
        }

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        /* XXX: LOAD_IMM */

        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
                process_mux_deps(state, n, inst->alu.add.a);
        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
                process_mux_deps(state, n, inst->alu.add.b);

        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
                process_mux_deps(state, n, inst->alu.mul.a);
        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
                process_mux_deps(state, n, inst->alu.mul.b);

        switch (inst->alu.add.op) {
        case V3D_QPU_A_VPMSETUP:
                /* Could distinguish read/write by unpacking the uniform. */
                add_write_dep(state, &state->last_vpm, n);
                add_write_dep(state, &state->last_vpm_read, n);
                break;

        case V3D_QPU_A_STVPMV:
        case V3D_QPU_A_STVPMD:
        case V3D_QPU_A_STVPMP:
                add_write_dep(state, &state->last_vpm, n);
                break;

        case V3D_QPU_A_VPMWT:
                add_read_dep(state, state->last_vpm, n);
                break;

        case V3D_QPU_A_MSF:
                add_read_dep(state, state->last_tlb, n);
                break;

        case V3D_QPU_A_SETMSF:
        case V3D_QPU_A_SETREVF:
                add_write_dep(state, &state->last_tlb, n);
                break;

        case V3D_QPU_A_FLAPUSH:
        case V3D_QPU_A_FLBPUSH:
        case V3D_QPU_A_VFLA:
        case V3D_QPU_A_VFLNA:
        case V3D_QPU_A_VFLB:
        case V3D_QPU_A_VFLNB:
                add_read_dep(state, state->last_sf, n);
                break;

        case V3D_QPU_A_FLBPOP:
                add_write_dep(state, &state->last_sf, n);
                break;

        default:
                break;
        }

        switch (inst->alu.mul.op) {
        case V3D_QPU_M_MULTOP:
        case V3D_QPU_M_UMUL24:
                /* MULTOP sets rtop, and UMUL24 implicitly reads rtop and
                 * resets it to 0.  We could possibly reorder umul24s relative
                 * to each other, but for now just keep all the MUL parts in
                 * order.
                 */
                add_write_dep(state, &state->last_rtop, n);
                break;
        default:
                break;
        }

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                process_waddr_deps(state, n, inst->alu.add.waddr,
                                   inst->alu.add.magic_write);
        }
        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                process_waddr_deps(state, n, inst->alu.mul.waddr,
                                   inst->alu.mul.magic_write);
        }
        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig)) {
                process_waddr_deps(state, n, inst->sig_addr,
                                   inst->sig_magic);
        }

        if (v3d_qpu_writes_r3(devinfo, inst))
                add_write_dep(state, &state->last_r[3], n);
        if (v3d_qpu_writes_r4(devinfo, inst))
                add_write_dep(state, &state->last_r[4], n);
        if (v3d_qpu_writes_r5(devinfo, inst))
                add_write_dep(state, &state->last_r[5], n);

        if (inst->sig.thrsw) {
                /* All accumulator contents and flags are undefined after the
                 * switch.
                 */
                for (int i = 0; i < ARRAY_SIZE(state->last_r); i++)
                        add_write_dep(state, &state->last_r[i], n);
                add_write_dep(state, &state->last_sf, n);

                /* Scoreboard-locking operations have to stay after the last
                 * thread switch.
                 */
                add_write_dep(state, &state->last_tlb, n);

                add_write_dep(state, &state->last_tmu_write, n);
                add_write_dep(state, &state->last_tmu_config, n);
        }

        if (inst->sig.ldtmu) {
                /* TMU loads are coming from a FIFO, so ordering is important.
                 */
                add_write_dep(state, &state->last_tmu_write, n);
        }

        if (inst->sig.wrtmuc)
                add_write_dep(state, &state->last_tmu_config, n);

        if (inst->sig.ldtlb | inst->sig.ldtlbu)
                add_read_dep(state, state->last_tlb, n);

        if (inst->sig.ldvpm)
                add_write_dep(state, &state->last_vpm_read, n);

        /* inst->sig.ldunif or sideband uniform read */
        if (qinst->uniform != ~0)
                add_write_dep(state, &state->last_unif, n);

        process_cond_deps(state, n, inst->flags.ac);
        process_cond_deps(state, n, inst->flags.mc);
        process_pf_deps(state, n, inst->flags.apf);
        process_pf_deps(state, n, inst->flags.mpf);
        process_uf_deps(state, n, inst->flags.auf);
        process_uf_deps(state, n, inst->flags.muf);
}

static void
calculate_forward_deps(struct v3d_compile *c, struct list_head *schedule_list)
{
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.devinfo = c->devinfo;
        state.dir = F;

        list_for_each_entry(struct schedule_node, node, schedule_list, link)
                calculate_deps(&state, node);
}

static void
calculate_reverse_deps(struct v3d_compile *c, struct list_head *schedule_list)
{
        struct list_head *node;
        struct schedule_state state;

        memset(&state, 0, sizeof(state));
        state.devinfo = c->devinfo;
        state.dir = R;

        for (node = schedule_list->prev; schedule_list != node; node = node->prev) {
                calculate_deps(&state, (struct schedule_node *)node);
        }
}

struct choose_scoreboard {
        int tick;
        int last_sfu_write_tick;
        int last_ldvary_tick;
        int last_uniforms_reset_tick;
        uint32_t last_waddr_add, last_waddr_mul;
        bool tlb_locked;
};
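
/* Added summary of the helper below: it returns true if reading through the
 * given mux would violate a timing rule tracked in the scoreboard: physical
 * regfile reads can't happen in the instruction right after the write, r4
 * reads need a gap after an SFU write, and r5 reads need a gap after ldvary.
 */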
static bool
mux_reads_too_soon(struct choose_scoreboard *scoreboard,
                   const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
{
        switch (mux) {
        case V3D_QPU_MUX_A:
                if (scoreboard->last_waddr_add == inst->raddr_a ||
                    scoreboard->last_waddr_mul == inst->raddr_a) {
                        return true;
                }
                break;

        case V3D_QPU_MUX_B:
                if (scoreboard->last_waddr_add == inst->raddr_b ||
                    scoreboard->last_waddr_mul == inst->raddr_b) {
                        return true;
                }
                break;

        case V3D_QPU_MUX_R4:
                if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
                        return true;
                break;

        case V3D_QPU_MUX_R5:
                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
                        return true;
                break;
        default:
                break;
        }

        return false;
}

static bool
reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
                           struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* XXX: Branching off of raddr. */
        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return false;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
                        return true;
                }
                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
                        return true;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
                        return true;
                }
                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
                        return true;
                }
        }

        /* XXX: imm */

        return false;
}

static bool
writes_too_soon_after_write(const struct v3d_device_info *devinfo,
                            struct choose_scoreboard *scoreboard,
                            struct qinst *qinst)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Don't schedule any other r4 write too soon after an SFU write.
         * This would normally be prevented by dependency tracking, but might
         * occur if a dead SFU computation makes it to scheduling.
         */
        if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
            v3d_qpu_writes_r4(devinfo, inst))
                return true;

        return false;
}

static bool
pixel_scoreboard_too_soon(struct choose_scoreboard *scoreboard,
                          const struct v3d_qpu_instr *inst)
{
        return (scoreboard->tick == 0 && qpu_inst_is_tlb(inst));
}

static int
get_instruction_priority(const struct v3d_qpu_instr *inst)
{
        uint32_t baseline_score;
        uint32_t next_score = 0;

        /* Schedule TLB operations as late as possible, to get more
         * parallelism between shaders.
         */
        if (qpu_inst_is_tlb(inst))
                return next_score;
        next_score++;

        /* Schedule texture read results collection late to hide latency. */
        if (inst->sig.ldtmu)
                return next_score;
        next_score++;

        /* Default score for things that aren't otherwise special. */
        baseline_score = next_score;
        next_score++;

        /* Schedule texture read setup early to hide their latency better. */
        if (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
            ((inst->alu.add.magic_write &&
              v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr)) ||
             (inst->alu.mul.magic_write &&
              v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr)))) {
                return next_score;
        }
        next_score++;

        return baseline_score;
}

static bool
qpu_magic_waddr_is_periph(enum v3d_qpu_waddr waddr)
{
        return (v3d_qpu_magic_waddr_is_tmu(waddr) ||
                v3d_qpu_magic_waddr_is_sfu(waddr) ||
                v3d_qpu_magic_waddr_is_tlb(waddr) ||
                v3d_qpu_magic_waddr_is_vpm(waddr) ||
                v3d_qpu_magic_waddr_is_tsy(waddr));
}

static bool
qpu_accesses_peripheral(const struct v3d_qpu_instr *inst)
{
        if (v3d_qpu_uses_vpm(inst))
                return true;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                if (inst->alu.add.op != V3D_QPU_A_NOP &&
                    inst->alu.add.magic_write &&
                    qpu_magic_waddr_is_periph(inst->alu.add.waddr)) {
                        return true;
                }

                if (inst->alu.mul.op != V3D_QPU_M_NOP &&
                    inst->alu.mul.magic_write &&
                    qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
                        return true;
                }
        }

        return (inst->sig.ldvpm ||
                inst->sig.ldtmu ||
                inst->sig.ldtlb ||
                inst->sig.ldtlbu ||
                inst->sig.wrtmuc);
}

static bool
qpu_merge_inst(const struct v3d_device_info *devinfo,
               struct v3d_qpu_instr *result,
               const struct v3d_qpu_instr *a,
               const struct v3d_qpu_instr *b)
{
        if (a->type != V3D_QPU_INSTR_TYPE_ALU ||
            b->type != V3D_QPU_INSTR_TYPE_ALU) {
                return false;
        }

        /* Can't do more than one peripheral access in an instruction.
         *
         * XXX: V3D 4.1 allows TMU read along with a VPM read or write, and
         * WRTMUC with a TMU magic register write (other than tmuc).
         */
        if (qpu_accesses_peripheral(a) && qpu_accesses_peripheral(b))
                return false;

        struct v3d_qpu_instr merge = *a;

        if (b->alu.add.op != V3D_QPU_A_NOP) {
                if (a->alu.add.op != V3D_QPU_A_NOP)
                        return false;
                merge.alu.add = b->alu.add;

                merge.flags.ac = b->flags.ac;
                merge.flags.apf = b->flags.apf;
                merge.flags.auf = b->flags.auf;
        }

        if (b->alu.mul.op != V3D_QPU_M_NOP) {
                if (a->alu.mul.op != V3D_QPU_M_NOP)
                        return false;
                merge.alu.mul = b->alu.mul;

                merge.flags.mc = b->flags.mc;
                merge.flags.mpf = b->flags.mpf;
                merge.flags.muf = b->flags.muf;
        }

        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A)) {
                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A) &&
                    a->raddr_a != b->raddr_a) {
                        return false;
                }
                merge.raddr_a = b->raddr_a;
        }

        if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
                if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
                    a->raddr_b != b->raddr_b) {
                        return false;
                }
                merge.raddr_b = b->raddr_b;
        }

        merge.sig.thrsw |= b->sig.thrsw;
        merge.sig.ldunif |= b->sig.ldunif;
        merge.sig.ldunifrf |= b->sig.ldunifrf;
        merge.sig.ldunifa |= b->sig.ldunifa;
        merge.sig.ldunifarf |= b->sig.ldunifarf;
        merge.sig.ldtmu |= b->sig.ldtmu;
        merge.sig.ldvary |= b->sig.ldvary;
        merge.sig.ldvpm |= b->sig.ldvpm;
        merge.sig.small_imm |= b->sig.small_imm;
        merge.sig.ldtlb |= b->sig.ldtlb;
        merge.sig.ldtlbu |= b->sig.ldtlbu;
        merge.sig.ucb |= b->sig.ucb;
        merge.sig.rotate |= b->sig.rotate;
        merge.sig.wrtmuc |= b->sig.wrtmuc;

        if (v3d_qpu_sig_writes_address(devinfo, &a->sig) &&
            v3d_qpu_sig_writes_address(devinfo, &b->sig))
                return false;
        merge.sig_addr |= b->sig_addr;
        merge.sig_magic |= b->sig_magic;
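
        /* Added note: as a final check, try packing the merged instruction;
         * if the combined signals and operands can't be encoded into a single
         * QPU instruction, the merge is rejected.
         */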
        uint64_t packed;
        bool ok = v3d_qpu_instr_pack(devinfo, &merge, &packed);

        *result = merge;
        /* No modifying the real instructions on failure. */
        assert(ok || (a != result && b != result));

        return ok;
}

static struct schedule_node *
choose_instruction_to_schedule(const struct v3d_device_info *devinfo,
                               struct choose_scoreboard *scoreboard,
                               struct list_head *schedule_list,
                               struct schedule_node *prev_inst)
{
        struct schedule_node *chosen = NULL;
        int chosen_prio = 0;

        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
         * will handle pairing it along with filling the delay slots.
         */
        if (prev_inst) {
                if (prev_inst->inst->qpu.sig.thrsw)
                        return NULL;
        }

        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                const struct v3d_qpu_instr *inst = &n->inst->qpu;

                /* Don't choose the branch instruction until it's the last one
                 * left.  We'll move it up to fit its delay slots after we
                 * choose it.
                 */
                if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
                    !list_is_singular(schedule_list)) {
                        continue;
                }

                /* "An instruction must not read from a location in physical
                 *  regfile A or B that was written to by the previous
                 *  instruction."
                 */
                if (reads_too_soon_after_write(scoreboard, n->inst))
                        continue;

                if (writes_too_soon_after_write(devinfo, scoreboard, n->inst))
                        continue;

                /* "A scoreboard wait must not occur in the first two
                 *  instructions of a fragment shader. This is either the
                 *  explicit Wait for Scoreboard signal or an implicit wait
                 *  with the first tile-buffer read or write instruction."
                 */
                if (pixel_scoreboard_too_soon(scoreboard, inst))
                        continue;

                /* ldunif and ldvary both write r5, but ldunif does so a tick
                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
                 * otherwise get scheduled so ldunif and ldvary try to update
                 * r5 in the same tick.
                 */
                if ((inst->sig.ldunif || inst->sig.ldunifa) &&
                    scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
                        continue;
                }

                /* If we're trying to pair with another instruction, check
                 * that they're compatible.
                 */
                if (prev_inst) {
                        /* Don't pair up a thread switch signal -- we'll
                         * handle pairing it when we pick it on its own.
                         */
                        if (inst->sig.thrsw)
                                continue;

                        if (prev_inst->inst->uniform != -1 &&
                            n->inst->uniform != -1)
                                continue;

                        /* Don't merge in something that will lock the TLB.
                         * Hopefully what we have in inst will release some
                         * other instructions, allowing us to delay the
                         * TLB-locking instruction until later.
                         */
                        if (!scoreboard->tlb_locked && qpu_inst_is_tlb(inst))
                                continue;

                        struct v3d_qpu_instr merged_inst;
                        if (!qpu_merge_inst(devinfo, &merged_inst,
                                            &prev_inst->inst->qpu, inst)) {
                                continue;
                        }
                }

                int prio = get_instruction_priority(inst);

                /* Found a valid instruction.  If nothing better comes along,
                 * this one works.
                 */
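                /* Added note: candidates are compared first by
                 * get_instruction_priority() and then, among equal
                 * priorities, by ->delay below, so ties go to the instruction
                 * on the longest dependency chain to the end of the block.
                 */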
                if (!chosen) {
                        chosen = n;
                        chosen_prio = prio;
                        continue;
                }

                if (prio > chosen_prio) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (prio < chosen_prio) {
                        continue;
                }

                if (n->delay > chosen->delay) {
                        chosen = n;
                        chosen_prio = prio;
                } else if (n->delay < chosen->delay) {
                        continue;
                }
        }

        return chosen;
}

static void
update_scoreboard_for_magic_waddr(struct choose_scoreboard *scoreboard,
                                  enum v3d_qpu_waddr waddr)
{
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                scoreboard->last_sfu_write_tick = scoreboard->tick;
}

static void
update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                             const struct v3d_qpu_instr *inst)
{
        scoreboard->last_waddr_add = ~0;
        scoreboard->last_waddr_mul = ~0;

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                return;

        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);

        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.add.waddr);
                } else {
                        scoreboard->last_waddr_add = inst->alu.add.waddr;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        update_scoreboard_for_magic_waddr(scoreboard,
                                                          inst->alu.mul.waddr);
                } else {
                        scoreboard->last_waddr_mul = inst->alu.mul.waddr;
                }
        }

        if (inst->sig.ldvary)
                scoreboard->last_ldvary_tick = scoreboard->tick;

        if (qpu_inst_is_tlb(inst))
                scoreboard->tlb_locked = true;
}

static void
dump_state(const struct v3d_device_info *devinfo,
           struct list_head *schedule_list)
{
        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                fprintf(stderr, " t=%4d: ", n->unblocked_time);
                v3d_qpu_dump(devinfo, &n->inst->qpu);
                fprintf(stderr, "\n");

                for (int i = 0; i < n->child_count; i++) {
                        struct schedule_node *child = n->children[i].node;
                        if (!child)
                                continue;

                        fprintf(stderr, " - ");
                        v3d_qpu_dump(devinfo, &child->inst->qpu);
                        fprintf(stderr, " (%d parents, %c)\n",
                                child->parent_count,
                                n->children[i].write_after_read ? 'w' : 'r');
                }
        }
}

static uint32_t magic_waddr_latency(enum v3d_qpu_waddr waddr,
                                    const struct v3d_qpu_instr *after)
{
        /* Apply some huge latency between texture fetch requests and getting
         * their results back.
         *
         * FIXME: This is actually pretty bogus.  If we do:
         *
         * mov tmu0_s, a
         * <a bit of math>
         * mov tmu0_s, b
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * we count that as worse than
         *
         * mov tmu0_s, a
         * mov tmu0_s, b
         * <lots of math>
         * load_tmu0
         * <more math>
         * load_tmu0
         *
         * because we associate the first load_tmu0 with the *second* tmu0_s.
         */
        if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu)
                return 100;

        /* Assume that anything depending on us is consuming the SFU result. */
        if (v3d_qpu_magic_waddr_is_sfu(waddr))
                return 3;

        return 1;
}

static uint32_t
instruction_latency(struct schedule_node *before, struct schedule_node *after)
{
        const struct v3d_qpu_instr *before_inst = &before->inst->qpu;
        const struct v3d_qpu_instr *after_inst = &after->inst->qpu;
        uint32_t latency = 1;

        if (before_inst->type != V3D_QPU_INSTR_TYPE_ALU ||
            after_inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return latency;

        if (before_inst->alu.add.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(before_inst->alu.add.waddr,
                                                   after_inst));
        }

        if (before_inst->alu.mul.magic_write) {
                latency = MAX2(latency,
                               magic_waddr_latency(before_inst->alu.mul.waddr,
                                                   after_inst));
        }

        return latency;
}

/** Recursive computation of the delay member of a node. */
static void
compute_delay(struct schedule_node *n)
{
        if (!n->child_count) {
                n->delay = 1;
        } else {
                for (int i = 0; i < n->child_count; i++) {
                        if (!n->children[i].node->delay)
                                compute_delay(n->children[i].node);
                        n->delay = MAX2(n->delay,
                                        n->children[i].node->delay +
                                        instruction_latency(n, n->children[i].node));
                }
        }
}

static void
mark_instruction_scheduled(struct list_head *schedule_list,
                           uint32_t time,
                           struct schedule_node *node,
                           bool war_only)
{
        if (!node)
                return;

        for (int i = node->child_count - 1; i >= 0; i--) {
                struct schedule_node *child =
                        node->children[i].node;

                if (!child)
                        continue;

                if (war_only && !node->children[i].write_after_read)
                        continue;

                /* If the requirement is only that the node not appear before
                 * the last read of its destination, then it can be scheduled
                 * immediately after (or paired with!) the thing reading the
                 * destination.
                 */
                uint32_t latency = 0;
                if (!war_only) {
                        latency = instruction_latency(node,
                                                      node->children[i].node);
                }

                child->unblocked_time = MAX2(child->unblocked_time,
                                             time + latency);
                child->parent_count--;
                if (child->parent_count == 0)
                        list_add(&child->link, schedule_list);

                node->children[i].node = NULL;
        }
}

static void
insert_scheduled_instruction(struct v3d_compile *c,
                             struct qblock *block,
                             struct choose_scoreboard *scoreboard,
                             struct qinst *inst)
{
        list_addtail(&inst->link, &block->instructions);

        update_scoreboard_for_chosen(scoreboard, &inst->qpu);
        c->qpu_inst_count++;
        scoreboard->tick++;
}

static struct qinst *
vir_nop()
{
        struct qreg undef = { QFILE_NULL, 0 };
        struct qinst *qinst = vir_add_inst(V3D_QPU_A_NOP, undef, undef, undef);

        return qinst;
}

static void
emit_nop(struct v3d_compile *c, struct qblock *block,
         struct choose_scoreboard *scoreboard)
{
        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
}

static bool
qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
                                     const struct qinst *qinst, int slot)
{
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        /* Only TLB Z writes are prohibited in the last slot, but we don't
         * have those flagged so prohibit all TLB ops for now.
         */
        if (slot == 2 && qpu_inst_is_tlb(inst))
                return false;

        if (slot > 0 && qinst->uniform != ~0)
                return false;

        if (v3d_qpu_uses_vpm(inst))
                return false;

        if (inst->sig.ldvary)
                return false;

        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                /* No writing physical registers at the end. */
                if (!inst->alu.add.magic_write ||
                    !inst->alu.mul.magic_write) {
                        return false;
                }

                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
                        return false;

                /* RF0-2 might be overwritten during the delay slots by
                 * fragment shader setup.
                 */
                if (inst->raddr_a < 3 &&
                    (inst->alu.add.a == V3D_QPU_MUX_A ||
                     inst->alu.add.b == V3D_QPU_MUX_A ||
                     inst->alu.mul.a == V3D_QPU_MUX_A ||
                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
                        return false;
                }

                if (inst->raddr_b < 3 &&
                    !inst->sig.small_imm &&
                    (inst->alu.add.a == V3D_QPU_MUX_B ||
                     inst->alu.add.b == V3D_QPU_MUX_B ||
                     inst->alu.mul.a == V3D_QPU_MUX_B ||
                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
                        return false;
                }
        }

        return true;
}

static bool
valid_thrsw_sequence(struct v3d_compile *c,
                     struct qinst *qinst, int instructions_in_sequence,
                     bool is_thrend)
{
        for (int slot = 0; slot < instructions_in_sequence; slot++) {
                /* No scheduling SFU when the result would land in the other
                 * thread.  The simulator complains for safety, though it
                 * would only occur for dead code in our case.
                 */
                if (slot > 0 &&
                    qinst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                    (v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.add.waddr) ||
                     v3d_qpu_magic_waddr_is_sfu(qinst->qpu.alu.mul.waddr))) {
                        return false;
                }

                if (slot > 0 && qinst->qpu.sig.ldvary)
                        return false;

                if (is_thrend &&
                    !qpu_instruction_valid_in_thrend_slot(c, qinst, slot)) {
                        return false;
                }

                /* Note that the list is circular, so we can only do this up
                 * to instructions_in_sequence.
                 */
                qinst = (struct qinst *)qinst->link.next;
        }

        return true;
}

/**
 * Emits a THRSW signal in the stream, trying to move it up to pair with
 * another instruction.
 */
static int
emit_thrsw(struct v3d_compile *c,
           struct qblock *block,
           struct choose_scoreboard *scoreboard,
           struct qinst *inst,
           bool is_thrend)
{
        int time = 0;

        /* There should be nothing in a thrsw inst being scheduled other than
         * the signal bits.
         */
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);

        /* Find how far back into previous instructions we can put the THRSW. */
        int slots_filled = 0;
        struct qinst *merge_inst = NULL;
        vir_for_each_inst_rev(prev_inst, block) {
                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
                sig.thrsw = true;
                uint32_t packed_sig;

                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                        break;

                if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
                                          is_thrend)) {
                        break;
                }

                merge_inst = prev_inst;
                if (++slots_filled == 3)
                        break;
        }

        bool needs_free = false;
        if (merge_inst) {
                merge_inst->qpu.sig.thrsw = true;
                needs_free = true;
        } else {
                insert_scheduled_instruction(c, block, scoreboard, inst);
                time++;
                slots_filled++;
                merge_inst = inst;
        }

        /* Insert any extra delay slot NOPs we need. */
        for (int i = 0; i < 3 - slots_filled; i++) {
                emit_nop(c, block, scoreboard);
                time++;
        }

        /* If we're emitting the last THRSW (other than program end), then
         * signal that to the HW by emitting two THRSWs in a row.
         */
        if (inst->is_last_thrsw) {
                struct qinst *second_inst =
                        (struct qinst *)merge_inst->link.next;
                second_inst->qpu.sig.thrsw = true;
        }

        /* If we put our THRSW into another instruction, free up the
         * instruction that didn't end up scheduled into the list.
         */
        if (needs_free)
                free(inst);

        return time;
}

static uint32_t
schedule_instructions(struct v3d_compile *c,
                      struct choose_scoreboard *scoreboard,
                      struct qblock *block,
                      struct list_head *schedule_list,
                      enum quniform_contents *orig_uniform_contents,
                      uint32_t *orig_uniform_data,
                      uint32_t *next_uniform)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        uint32_t time = 0;

        if (debug) {
                fprintf(stderr, "initial deps:\n");
                dump_state(devinfo, schedule_list);
                fprintf(stderr, "\n");
        }

        /* Remove non-DAG heads from the list. */
        list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
                if (n->parent_count != 0)
                        list_del(&n->link);
        }

        while (!list_empty(schedule_list)) {
                struct schedule_node *chosen =
                        choose_instruction_to_schedule(devinfo,
                                                       scoreboard,
                                                       schedule_list,
                                                       NULL);
                struct schedule_node *merge = NULL;

                /* If there are no valid instructions to schedule, drop a NOP
                 * in.
                 */
                struct qinst *qinst = chosen ? chosen->inst : vir_nop();
                struct v3d_qpu_instr *inst = &qinst->qpu;

                if (debug) {
                        fprintf(stderr, "t=%4d: current list:\n",
                                time);
                        dump_state(devinfo, schedule_list);
                        fprintf(stderr, "t=%4d: chose: ", time);
                        v3d_qpu_dump(devinfo, inst);
                        fprintf(stderr, "\n");
                }

                /* Schedule this instruction onto the QPU list.  Also try to
                 * find an instruction to pair with it.
                 */
                if (chosen) {
                        time = MAX2(chosen->unblocked_time, time);
                        list_del(&chosen->link);
                        mark_instruction_scheduled(schedule_list, time,
                                                   chosen, true);

                        merge = choose_instruction_to_schedule(devinfo,
                                                               scoreboard,
                                                               schedule_list,
                                                               chosen);
                        if (merge) {
                                time = MAX2(merge->unblocked_time, time);
                                list_del(&merge->link);
                                (void)qpu_merge_inst(devinfo, inst,
                                                     inst, &merge->inst->qpu);
                                if (merge->inst->uniform != -1) {
                                        chosen->inst->uniform =
                                                merge->inst->uniform;
                                }

                                if (debug) {
                                        fprintf(stderr, "t=%4d: merging: ",
                                                time);
                                        v3d_qpu_dump(devinfo, &merge->inst->qpu);
                                        fprintf(stderr, "\n");
                                        fprintf(stderr, " result: ");
                                        v3d_qpu_dump(devinfo, inst);
                                        fprintf(stderr, "\n");
                                }
                        }
                }

                /* Update the uniform index for the rewritten location --
                 * branch target updating will still need to change
                 * c->uniform_data[] using this index.
                 */
                if (qinst->uniform != -1) {
                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                                block->branch_uniform = *next_uniform;

                        c->uniform_data[*next_uniform] =
                                orig_uniform_data[qinst->uniform];
                        c->uniform_contents[*next_uniform] =
                                orig_uniform_contents[qinst->uniform];
                        qinst->uniform = *next_uniform;
                        (*next_uniform)++;
                }

                if (debug) {
                        fprintf(stderr, "\n");
                }

                /* Now that we've scheduled a new instruction, some of its
                 * children can be promoted to the list of instructions ready to
                 * be scheduled.  Update the children's unblocked time for this
                 * DAG edge as we do so.
                 */
                mark_instruction_scheduled(schedule_list, time, chosen, false);

                if (merge) {
                        mark_instruction_scheduled(schedule_list, time, merge,
                                                   false);

                        /* The merged VIR instruction doesn't get re-added to the
                         * block, so free it now.
                         */
                        free(merge->inst);
                }

                if (inst->sig.thrsw) {
                        time += emit_thrsw(c, block, scoreboard, qinst, false);
                } else {
                        insert_scheduled_instruction(c, block,
                                                     scoreboard, qinst);

                        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                                block->branch_qpu_ip = c->qpu_inst_count - 1;
                                /* Fill the delay slots.
                                 *
                                 * We should fill these with actual instructions,
                                 * instead, but that will probably need to be done
                                 * after this, once we know what the leading
                                 * instructions of the successors are (so we can
                                 * handle A/B register file write latency)
                                 */
                                for (int i = 0; i < 3; i++)
                                        emit_nop(c, block, scoreboard);
                        }
                }
        }

        return time;
}

static uint32_t
qpu_schedule_instructions_block(struct v3d_compile *c,
                                struct choose_scoreboard *scoreboard,
                                struct qblock *block,
                                enum quniform_contents *orig_uniform_contents,
                                uint32_t *orig_uniform_data,
                                uint32_t *next_uniform)
{
        void *mem_ctx = ralloc_context(NULL);
        struct list_head schedule_list;

        list_inithead(&schedule_list);

        /* Wrap each instruction in a scheduler structure. */
        while (!list_empty(&block->instructions)) {
                struct qinst *qinst = (struct qinst *)block->instructions.next;
                struct schedule_node *n =
                        rzalloc(mem_ctx, struct schedule_node);

                n->inst = qinst;

                list_del(&qinst->link);
                list_addtail(&n->link, &schedule_list);
        }

        calculate_forward_deps(c, &schedule_list);
        calculate_reverse_deps(c, &schedule_list);

        list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
                compute_delay(n);
        }

        uint32_t cycles = schedule_instructions(c, scoreboard, block,
                                                &schedule_list,
                                                orig_uniform_contents,
                                                orig_uniform_data,
                                                next_uniform);

        ralloc_free(mem_ctx);

        return cycles;
}

static void
qpu_set_branch_targets(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                /* The end block of the program has no branch. */
                if (!block->successors[0])
                        continue;

                /* If there was no branch instruction, then the successor
                 * block must follow immediately after this one.
                 */
                if (block->branch_qpu_ip == ~0) {
                        assert(block->end_qpu_ip + 1 ==
                               block->successors[0]->start_qpu_ip);
                        continue;
                }

                /* Walk back through the delay slots to find the branch
                 * instr.
                 */
                struct list_head *entry = block->instructions.prev;
                for (int i = 0; i < 3; i++)
                        entry = entry->prev;
                struct qinst *branch = container_of(entry, branch, link);
                assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);

                /* Make sure that the if-we-don't-jump
                 * successor was scheduled just after the
                 * delay slots.
                 */
                assert(!block->successors[1] ||
                       block->successors[1]->start_qpu_ip ==
                       block->branch_qpu_ip + 4);

                branch->qpu.branch.offset =
                        ((block->successors[0]->start_qpu_ip -
                          (block->branch_qpu_ip + 4)) *
                         sizeof(uint64_t));

                /* Set up the relative offset to jump in the
                 * uniform stream.
                 *
                 * Use a temporary here, because
                 * uniform_data[inst->uniform] may be shared
                 * between multiple instructions.
                 */
                assert(c->uniform_contents[branch->uniform] == QUNIFORM_CONSTANT);
                c->uniform_data[branch->uniform] =
                        (block->successors[0]->start_uniform -
                         (block->branch_uniform + 1)) * 4;
        }
}

uint32_t
v3d_qpu_schedule_instructions(struct v3d_compile *c)
{
        const struct v3d_device_info *devinfo = c->devinfo;
        struct qblock *end_block = list_last_entry(&c->blocks,
                                                   struct qblock, link);

        /* We reorder the uniforms as we schedule instructions, so save the
         * old data off and replace it.
         */
        uint32_t *uniform_data = c->uniform_data;
        enum quniform_contents *uniform_contents = c->uniform_contents;
        c->uniform_contents = ralloc_array(c, enum quniform_contents,
                                           c->num_uniforms);
        c->uniform_data = ralloc_array(c, uint32_t, c->num_uniforms);
        c->uniform_array_size = c->num_uniforms;
        uint32_t next_uniform = 0;

        struct choose_scoreboard scoreboard;
        memset(&scoreboard, 0, sizeof(scoreboard));
        scoreboard.last_waddr_add = ~0;
        scoreboard.last_waddr_mul = ~0;
        scoreboard.last_ldvary_tick = -10;
        scoreboard.last_sfu_write_tick = -10;
        scoreboard.last_uniforms_reset_tick = -10;

        if (debug) {
                fprintf(stderr, "Pre-schedule instructions\n");
                vir_for_each_block(block, c) {
                        fprintf(stderr, "BLOCK %d\n", block->index);
                        list_for_each_entry(struct qinst, qinst,
                                            &block->instructions, link) {
                                v3d_qpu_dump(devinfo, &qinst->qpu);
                                fprintf(stderr, "\n");
                        }
                }
                fprintf(stderr, "\n");
        }

        uint32_t cycles = 0;
        vir_for_each_block(block, c) {
                block->start_qpu_ip = c->qpu_inst_count;
                block->branch_qpu_ip = ~0;
                block->start_uniform = next_uniform;

                cycles += qpu_schedule_instructions_block(c,
                                                          &scoreboard,
                                                          block,
                                                          uniform_contents,
                                                          uniform_data,
                                                          &next_uniform);

                block->end_qpu_ip = c->qpu_inst_count - 1;
        }

        /* Emit the program-end THRSW instruction. */
        struct qinst *thrsw = vir_nop();
        thrsw->qpu.sig.thrsw = true;
        emit_thrsw(c, end_block, &scoreboard, thrsw, true);

        qpu_set_branch_targets(c);

        assert(next_uniform == c->num_uniforms);

        return cycles;
}