/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth-based scheduling algo.  Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions).  Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would no longer be valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of the address
 * register (a0.x) or predicate register (p0.x) must be serialized.  I.e.
 * if you have two pairs of instructions that write the same special
 * register and then read it, those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
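
/* A sketch of the a0.x "critical section" described above (pseudo
 * instruction sequence, not actual ir3 syntax):
 *
 *    write a0.x <- A      ; pair A writes the address register
 *    ...  use a0.x (A)    ; pair A's indirect access reads it
 *    write a0.x <- B      ; pair B must not be interleaved with pair A
 *    ...  use a0.x (B)
 *
 * Once A's write is scheduled, check_instr() rejects B's write (flagging
 * addr_conflict) until all remaining users of A's value are scheduled,
 * or the value is "spilled" by cloning the writer (see split_addr()).
 */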

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove */
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

#define NULL_INSTR ((void *)~0)

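/* Invalidate cached scheduling decisions (stored in instr->data) for the
 * unscheduled instructions on the depth list.  With instr == NULL every
 * cached result is cleared; otherwise only entries that cached 'instr'
 * (or a cached "nothing schedulable" NULL_INSTR result) are cleared.
 */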
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

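/* Emit 'instr' into the block: remove it from the depth list, mark it
 * scheduled, record it as the current a0.x/p0.x writer if applicable,
 * and invalidate any cached decisions that scheduling it makes stale.
 * A nop is stuffed first if this would put two sfu/mem instructions
 * back-to-back.
 */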
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

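/* Return the deepest (largest ->depth) instruction from the srcs[] array,
 * NULL'ing out its slot so that successive calls walk the remaining
 * sources in order of decreasing depth.  Returns NULL when exhausted.
 */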
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

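/* Count how many alu/flow instructions have already been emitted into the
 * current block after 'instr' (capped at maxd).  This is the portion of a
 * required delay that is already covered, e.g. if a src needs 3 delay
 * slots and 2 alu instructions already follow it, only 1 nop is still
 * needed.
 */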
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				break;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		/* for array writes, no need to delay on previous write: */
		if (i == 0)
			continue;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule.  Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from the specified instruction or,
 * recursively, its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's.  Otherwise the recursive algo can scale quite
	 * badly w/ shader size.  But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs.  But we
	 * don't have such a thing.  Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		delay = delay_calc(ctx, candidate);
		if (delay < min_delay) {
			best_instr = candidate;
			min_delay = delay;
		}

		if (min_delay == 0)
			break;
	}

	return best_instr;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

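/* Schedule a single basic block: meta:input/phi instructions are emitted
 * first, everything else goes onto the depth sorted list.  Then repeatedly
 * pick the eligible instruction with the least delay (inserting nop's as
 * needed), splitting the current a0.x/p0.x writer to break deadlocks.
 * Finally append the branch/jump instructions to the successor block(s).
 */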
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if ((instr->opc == OPC_META_INPUT) || (instr->opc == OPC_META_PHI)) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes);

		if (instr) {
			unsigned delay = delay_calc(ctx, instr);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate the (jp) flag..
	 */
}

/* this is needed to ensure later RA stage succeeds: */
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->opc == OPC_META_PHI) {
			struct ir3_register *reg, *reg2;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov = NULL;

				/* after CP we could end up w/ duplicate phi srcs: */
				foreach_src(reg2, instr) {
					if (reg == reg2)
						break;
					/* reg2 is before reg, so already an inserted mov: */
					else if (reg2->instr->regs[1]->instr == src) {
						mov = reg2->instr;
						break;
					}
				}

				if (!mov) {
					mov = ir3_MOV(src->block, src, TYPE_U32);
					mov->regs[0]->flags |= IR3_REG_PHI_SRC;
					mov->regs[0]->instr = instr;
				}

				reg->instr = mov;
			}
		}
	}
}

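/* Main entry point: insert parallel copies for phi srcs, clear the
 * scheduled marks, and schedule each block in turn.  Returns -1 if
 * scheduling failed.
 */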
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	if (ctx.error)
		return -1;
	return 0;
}