/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo.  Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions).  Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would no longer be valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

#define NULL_INSTR ((void *)~0)

static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

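/* Commit the chosen instruction: pull it off the depth list, record it as
 * the current a0.x/p0.x writer where applicable, append it to the block's
 * (in construction) instruction list, and invalidate any cached
 * find_instr_recursive() results that scheduling it makes stale.
 */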
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list: */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				break;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		/* for array writes, no need to delay on previous write: */
		if (i == 0)
			continue;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

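/* Note on the delay calculation above: ir3_delayslots() gives the number of
 * cycles the consumer must wait on the assigner, and distance() counts the
 * alu/flow instructions already emitted since the assigner.  So, for example,
 * if a src needs 3 delay slots but 2 eligible instructions were already
 * scheduled in between, only 1 cycle worth of nop's remains for the caller
 * (sched_block()) to insert.
 */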
struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

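/* The instr->data cache used by find_instr_recursive() below has three
 * states: NULL means the instruction has not been visited yet (or its
 * cached result was invalidated by clear_cache()), NULL_INSTR means no
 * schedulable candidate was found via this instruction, and anything else
 * is the cached candidate itself.
 */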
/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		delay = delay_calc(ctx, candidate);
		if (delay < min_delay) {
			best_instr = candidate;
			min_delay = delay;
		}

		if (min_delay == 0)
			break;
	}

	return best_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

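/* Note: split_addr() above and split_pred() below only create and return
 * the clone; the caller (sched_block()) is responsible for inserting it
 * into the depth list and re-pointing it at the current block.
 */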
/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

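/* Schedule a single basic block: instructions are moved off to a temporary
 * list, meta input/phi instructions are scheduled immediately (RA expects
 * them at the top of the block) and everything else goes onto the depth
 * sorted list.  We then repeatedly pick the eligible instruction with the
 * smallest delay, padding with nop's as needed and breaking a0.x/p0.x
 * deadlocks via split_addr()/split_pred(), and finally append the
 * branch/jump to the successor block(s).
 */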
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if ((instr->opc == OPC_META_INPUT) || (instr->opc == OPC_META_PHI)) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes);

		if (instr) {
			unsigned delay = delay_calc(ctx, instr);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* this is needed to ensure later RA stage succeeds: */
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->opc == OPC_META_PHI) {
			struct ir3_register *reg, *reg2;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov = NULL;

				/* after CP we could end up w/ duplicate phi srcs: */
				foreach_src(reg2, instr) {
					if (reg == reg2)
						break;
					/* reg2 comes before reg, so it is already an inserted mov: */
					else if (reg2->instr->regs[1]->instr == src) {
						mov = reg2->instr;
						break;
					}
				}

				if (!mov) {
					mov = ir3_MOV(src->block, src, TYPE_U32);
					mov->regs[0]->flags |= IR3_REG_PHI_SRC;
					mov->regs[0]->instr = instr;
				}

				reg->instr = mov;
			}
		}
	}
}

int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	if (ctx.error)
		return -1;
	return 0;
}