/*
 * Copyright 2013 Vadim Girlin <vadimgirlin (at) gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */

#define PSC_DEBUG 0

#if PSC_DEBUG
#define PSC_DUMP(a) do { a } while (0)
#else
#define PSC_DUMP(a)
#endif

#include "sb_bc.h"
#include "sb_shader.h"
#include "sb_pass.h"
#include "sb_sched.h"
#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1

namespace r600_sb {

rp_kcache_tracker::rp_kcache_tracker(shader &sh) : rp(), uc(),
		// FIXME: for now we'll use "two const pairs" limit for r600, same as
		// for other chips, otherwise additional check in alu_group_tracker is
		// required to make sure that all 4 consts in the group fit into 2
		// kcache sets
		sel_count(2) {}

bool rp_kcache_tracker::try_reserve(sel_chan r) {
	unsigned sel = kc_sel(r);

	for (unsigned i = 0; i < sel_count; ++i) {
		if (rp[i] == 0) {
			rp[i] = sel;
			++uc[i];
			return true;
		}
		if (rp[i] == sel) {
			++uc[i];
			return true;
		}
	}
	return false;
}

bool rp_kcache_tracker::try_reserve(node* n) {
	bool need_unreserve = false;
	vvec::iterator I(n->src.begin()), E(n->src.end());

	for (; I != E; ++I) {
		value *v = *I;
		if (v->is_kcache()) {
			if (!try_reserve(v->select))
				break;
			else
				need_unreserve = true;
		}
	}
	if (I == E)
		return true;

	if (need_unreserve && I != n->src.begin()) {
		do {
			--I;
			value *v = *I;
			if (v->is_kcache())
				unreserve(v->select);
		} while (I != n->src.begin());
	}
	return false;
}

inline
void rp_kcache_tracker::unreserve(node* n) {
	vvec::iterator I(n->src.begin()), E(n->src.end());
	for (; I != E; ++I) {
		value *v = *I;
		if (v->is_kcache())
			unreserve(v->select);
	}
}

void rp_kcache_tracker::unreserve(sel_chan r) {
	unsigned sel = kc_sel(r);

	for (unsigned i = 0; i < sel_count; ++i)
		if (rp[i] == sel) {
			if (--uc[i] == 0)
				rp[i] = 0;
			return;
		}
	assert(0);
	return;
}
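
// Literal tracking: an ALU group can carry at most MAX_ALU_LITERALS literal
// dwords, and identical literal values are shared between instructions via
// the use counts in uc[]. E.g. two instructions in the group that both read
// 1.0f (0x3f800000) should end up consuming a single literal slot with a use
// count of 2.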
bool literal_tracker::try_reserve(alu_node* n) {
	bool need_unreserve = false;

	vvec::iterator I(n->src.begin()), E(n->src.end());

	for (; I != E; ++I) {
		value *v = *I;
		if (v->is_literal()) {
			if (!try_reserve(v->literal_value))
				break;
			else
				need_unreserve = true;
		}
	}
	if (I == E)
		return true;

	if (need_unreserve && I != n->src.begin()) {
		do {
			--I;
			value *v = *I;
			if (v->is_literal())
				unreserve(v->literal_value);
		} while (I != n->src.begin());
	}
	return false;
}

void literal_tracker::unreserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count, i;

	for (i = 0; i < nsrc; ++i) {
		value *v = n->src[i];
		if (v->is_literal())
			unreserve(v->literal_value);
	}
}

bool literal_tracker::try_reserve(literal l) {

	PSC_DUMP( sblog << "literal reserve " << l.u << " " << l.f << "\n"; );

	for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) {
		if (lt[i] == 0) {
			lt[i] = l;
			++uc[i];
			PSC_DUMP( sblog << " reserved new uc = " << uc[i] << "\n"; );
			return true;
		} else if (lt[i] == l) {
			++uc[i];
			PSC_DUMP( sblog << " reserved uc = " << uc[i] << "\n"; );
			return true;
		}
	}
	PSC_DUMP( sblog << " failed to reserve literal\n"; );
	return false;
}

void literal_tracker::unreserve(literal l) {

	PSC_DUMP( sblog << "literal unreserve " << l.u << " " << l.f << "\n"; );

	for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) {
		if (lt[i] == l) {
			if (--uc[i] == 0)
				lt[i] = 0;
			return;
		}
	}
	assert(0);
	return;
}

static inline unsigned bs_cycle_vector(unsigned bs, unsigned src) {
	static const unsigned swz[VEC_NUM][3] = {
		{0, 1, 2}, {0, 2, 1}, {1, 2, 0}, {1, 0, 2}, {2, 0, 1}, {2, 1, 0}
	};
	assert(bs < VEC_NUM && src < 3);
	return swz[bs][src];
}

static inline unsigned bs_cycle_scalar(unsigned bs, unsigned src) {
	static const unsigned swz[SCL_NUM][3] = {
		{2, 1, 0}, {1, 2, 2}, {2, 1, 2}, {2, 2, 1}
	};

	if (bs >= SCL_NUM || src >= 3) {
		// this prevents gcc warning "array subscript is above array bounds"
		// AFAICS we should never hit this path
		abort();
	}
	return swz[bs][src];
}
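
// The bank swizzle selects the read cycle (0..2) for each of up to three GPR
// source operands; the tables above encode these per-swizzle read patterns
// for the vector (VEC_*) and scalar/trans (SCL_*) slots. E.g. with VEC_210
// src0 is read in cycle 2, src1 in cycle 1 and src2 in cycle 0, so
// bs_cycle(false, VEC_210, 0) yields 2.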
static inline unsigned bs_cycle(bool trans, unsigned bs, unsigned src) {
	return trans ? bs_cycle_scalar(bs, src) : bs_cycle_vector(bs, src);
}

inline
bool rp_gpr_tracker::try_reserve(unsigned cycle, unsigned sel, unsigned chan) {
	++sel;
	if (rp[cycle][chan] == 0) {
		rp[cycle][chan] = sel;
		++uc[cycle][chan];
		return true;
	} else if (rp[cycle][chan] == sel) {
		++uc[cycle][chan];
		return true;
	}
	return false;
}

inline
void rp_gpr_tracker::unreserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count, i;
	unsigned trans = n->bc.slot == SLOT_TRANS;
	unsigned bs = n->bc.bank_swizzle;
	unsigned opt = !trans
			&& n->bc.src[0].sel == n->bc.src[1].sel
			&& n->bc.src[0].chan == n->bc.src[1].chan;

	for (i = 0; i < nsrc; ++i) {
		value *v = n->src[i];
		if (v->is_readonly() || v->is_undef())
			continue;
		if (i == 1 && opt)
			continue;
		unsigned cycle = bs_cycle(trans, bs, i);
		unreserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan);
	}
}

inline
void rp_gpr_tracker::unreserve(unsigned cycle, unsigned sel, unsigned chan) {
	++sel;
	assert(rp[cycle][chan] == sel && uc[cycle][chan]);
	if (--uc[cycle][chan] == 0)
		rp[cycle][chan] = 0;
}

inline
bool rp_gpr_tracker::try_reserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count, i;
	unsigned trans = n->bc.slot == SLOT_TRANS;
	unsigned bs = n->bc.bank_swizzle;
	unsigned opt = !trans && nsrc >= 2 &&
			n->src[0] == n->src[1];

	bool need_unreserve = false;
	unsigned const_count = 0, min_gpr_cycle = 3;

	for (i = 0; i < nsrc; ++i) {
		value *v = n->src[i];
		if (v->is_readonly() || v->is_undef()) {
			const_count++;
			if (trans && const_count == 3)
				break;
		} else {
			if (i == 1 && opt)
				continue;

			unsigned cycle = bs_cycle(trans, bs, i);

			if (trans && cycle < min_gpr_cycle)
				min_gpr_cycle = cycle;

			if (const_count && cycle < const_count && trans)
				break;

			if (!try_reserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan))
				break;
			else
				need_unreserve = true;
		}
	}

	if ((i == nsrc) && (min_gpr_cycle + 1 > const_count))
		return true;

	if (need_unreserve && i--) {
		do {
			value *v = n->src[i];
			if (!v->is_readonly() && !v->is_undef()) {
				if (i == 1 && opt)
					continue;
				unreserve(bs_cycle(trans, bs, i), n->bc.src[i].sel,
				          n->bc.src[i].chan);
			}
		} while (i--);
	}
	return false;
}
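
// Tracks the state of the ALU group currently being assembled: slot
// occupancy, GPR read ports, kcache constants and literal slots. Cayman
// has no trans unit, hence 4 slots instead of 5 (see max_slots and
// available_slots below).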
alu_group_tracker::alu_group_tracker(shader &sh)
	: sh(sh), kc(sh),
	  gpr(), lt(), slots(),
	  max_slots(sh.get_ctx().is_cayman() ? 4 : 5),
	  has_mova(), uses_ar(), has_predset(), has_kill(),
	  updates_exec_mask(), chan_count(), interp_param(), next_id() {

	available_slots = sh.get_ctx().has_trans ? 0x1F : 0x0F;
}

inline
sel_chan alu_group_tracker::get_value_id(value* v) {
	unsigned &id = vmap[v];
	if (!id)
		id = ++next_id;
	return sel_chan(id, v->get_final_chan());
}

inline
void alu_group_tracker::assign_slot(unsigned slot, alu_node* n) {
	update_flags(n);
	slots[slot] = n;
	available_slots &= ~(1 << slot);

	unsigned param = n->interp_param();

	if (param) {
		assert(!interp_param || interp_param == param);
		interp_param = param;
	}
}


void alu_group_tracker::discard_all_slots(container_node &removed_nodes) {
	PSC_DUMP( sblog << "agt::discard_all_slots\n"; );
	discard_slots(~available_slots & ((1 << max_slots) - 1), removed_nodes);
}

void alu_group_tracker::discard_slots(unsigned slot_mask,
                                      container_node &removed_nodes) {

	PSC_DUMP(
		sblog << "discard_slots : packed_ops : "
			<< (unsigned)packed_ops.size() << "\n";
	);

	for (node_vec::iterator N, I = packed_ops.begin();
			I != packed_ops.end(); I = N) {
		N = I; ++N;

		alu_packed_node *n = static_cast<alu_packed_node*>(*I);
		unsigned pslots = n->get_slot_mask();

		PSC_DUMP(
			sblog << "discard_slots : packed slot_mask : " << pslots << "\n";
		);

		if (pslots & slot_mask) {

			PSC_DUMP(
				sblog << "discard_slots : discarding packed...\n";
			);

			removed_nodes.push_back(n);
			slot_mask &= ~pslots;
			N = packed_ops.erase(I);
			available_slots |= pslots;
			for (unsigned k = 0; k < max_slots; ++k) {
				if (pslots & (1 << k))
					slots[k] = NULL;
			}
		}
	}

	for (unsigned slot = 0; slot < max_slots; ++slot) {
		unsigned slot_bit = 1 << slot;

		if (slot_mask & slot_bit) {
			assert(!(available_slots & slot_bit));
			assert(slots[slot]);

			assert(!(slots[slot]->bc.slot_flags & AF_4SLOT));

			PSC_DUMP(
				sblog << "discarding slot " << slot << " : ";
				dump::dump_op(slots[slot]);
				sblog << "\n";
			);

			removed_nodes.push_back(slots[slot]);
			slots[slot] = NULL;
			available_slots |= slot_bit;
		}
	}

	alu_node *t = slots[4];
	if (t && (t->bc.slot_flags & AF_V)) {
		unsigned chan = t->bc.dst_chan;
		if (!slots[chan]) {
			PSC_DUMP(
				sblog << "moving ";
				dump::dump_op(t);
				sblog << " from trans slot to free slot " << chan << "\n";
			);

			slots[chan] = t;
			slots[4] = NULL;
			t->bc.slot = chan;
		}
	}

	reinit();
}

alu_group_node* alu_group_tracker::emit() {

	alu_group_node *g = sh.create_alu_group();

	lt.init_group_literals(g);

	for (unsigned i = 0; i < max_slots; ++i) {
		alu_node *n = slots[i];
		if (n) {
			g->push_back(n);
		}
	}
	return g;
}
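
// Try to add a single instruction to the group: check slot availability and
// group-level constraints (interp param, KILL/PRED/MOVA interactions), then
// reserve literals, kcache constants and GPR read ports. If the GPR
// reservation fails for the instruction's current bank swizzle, the
// remaining swizzles are tried, and as a last resort the swizzles of all
// already reserved instructions are re-enumerated with backtracking (see
// the loop further below).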
bool alu_group_tracker::try_reserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count;
	unsigned slot = n->bc.slot;
	bool trans = slot == 4;

	if (slots[slot])
		return false;

	unsigned flags = n->bc.op_ptr->flags;

	unsigned param = n->interp_param();

	if (param && interp_param && interp_param != param)
		return false;

	if ((flags & AF_KILL) && has_predset)
		return false;
	if ((flags & AF_ANY_PRED) && (has_kill || has_predset))
		return false;
	if ((flags & AF_MOVA) && (has_mova || uses_ar))
		return false;

	if (n->uses_ar() && has_mova)
		return false;

	for (unsigned i = 0; i < nsrc; ++i) {

		unsigned last_id = next_id;

		value *v = n->src[i];
		if (!v->is_any_gpr() && !v->is_rel())
			continue;
		sel_chan vid = get_value_id(n->src[i]);

		if (vid > last_id && chan_count[vid.chan()] == 3) {
			return false;
		}

		n->bc.src[i].sel = vid.sel();
		n->bc.src[i].chan = vid.chan();
	}

	if (!lt.try_reserve(n))
		return false;

	if (!kc.try_reserve(n)) {
		lt.unreserve(n);
		return false;
	}

	unsigned fbs = n->forced_bank_swizzle();

	n->bc.bank_swizzle = 0;

	if (!trans && fbs)
		n->bc.bank_swizzle = VEC_210;

	if (gpr.try_reserve(n)) {
		assign_slot(slot, n);
		return true;
	}

	if (!fbs) {
		unsigned swz_num = trans ? SCL_NUM : VEC_NUM;
		for (unsigned bs = 0; bs < swz_num; ++bs) {
			n->bc.bank_swizzle = bs;
			if (gpr.try_reserve(n)) {
				assign_slot(slot, n);
				return true;
			}
		}
	}

	gpr.reset();

	slots[slot] = n;
	unsigned forced_swz_slots = 0;
	int first_slot = ~0, first_nf = ~0, last_slot = ~0;
	unsigned save_bs[5];

	for (unsigned i = 0; i < max_slots; ++i) {
		alu_node *a = slots[i];
		if (a) {
			if (first_slot == ~0)
				first_slot = i;
			last_slot = i;
			save_bs[i] = a->bc.bank_swizzle;
			if (a->forced_bank_swizzle()) {
				assert(i != SLOT_TRANS);
				forced_swz_slots |= (1 << i);
				a->bc.bank_swizzle = VEC_210;
				if (!gpr.try_reserve(a))
					assert(!"internal reservation error");
			} else {
				if (first_nf == ~0)
					first_nf = i;

				a->bc.bank_swizzle = 0;
			}
		}
	}

	if (first_nf == ~0) {
		assign_slot(slot, n);
		return true;
	}

	assert(first_slot != ~0 && last_slot != ~0);

	// silence "array subscript is above array bounds" with gcc 4.8
	if (last_slot >= 5)
		abort();

	int i = first_nf;
	alu_node *a = slots[i];
	bool backtrack = false;
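
	// Exhaustive search over the bank swizzles of all non-forced slots,
	// implemented as backtracking: advance while reservations succeed,
	// otherwise increment the current slot's swizzle, and when a slot runs
	// out of swizzles (past VEC_210/SCL_221) reset it and step back to the
	// previous occupied slot, unreserving its GPR reads.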
	while (1) {

		PSC_DUMP(
			sblog << " bs: trying s" << i << " bs:" << a->bc.bank_swizzle
				<< " bt:" << backtrack << "\n";
		);

		if (!backtrack && gpr.try_reserve(a)) {
			PSC_DUMP(
				sblog << " bs: reserved s" << i << " bs:" << a->bc.bank_swizzle
					<< "\n";
			);

			while ((++i <= last_slot) && !slots[i]);
			if (i <= last_slot)
				a = slots[i];
			else
				break;
		} else {
			bool itrans = i == SLOT_TRANS;
			unsigned max_swz = itrans ? SCL_221 : VEC_210;

			if (a->bc.bank_swizzle < max_swz) {
				++a->bc.bank_swizzle;

				PSC_DUMP(
					sblog << " bs: inc s" << i << " bs:" << a->bc.bank_swizzle
						<< "\n";
				);

			} else {

				a->bc.bank_swizzle = 0;
				while ((--i >= first_nf) && !slots[i]);
				if (i < first_nf)
					break;
				a = slots[i];
				PSC_DUMP(
					sblog << " bs: unreserve s" << i << " bs:"
						<< a->bc.bank_swizzle << "\n";
				);
				gpr.unreserve(a);
				backtrack = true;

				continue;
			}
		}
		backtrack = false;
	}

	if (i == last_slot + 1) {
		assign_slot(slot, n);
		return true;
	}

	// reservation failed, restore previous state
	slots[slot] = NULL;
	gpr.reset();
	for (unsigned i = 0; i < max_slots; ++i) {
		alu_node *a = slots[i];
		if (a) {
			a->bc.bank_swizzle = save_bs[i];
			bool b = gpr.try_reserve(a);
			assert(b);
		}
	}

	kc.unreserve(n);
	lt.unreserve(n);
	return false;
}

bool alu_group_tracker::try_reserve(alu_packed_node* p) {
	bool need_unreserve = false;
	node_iterator I(p->begin()), E(p->end());

	for (; I != E; ++I) {
		alu_node *n = static_cast<alu_node*>(*I);
		if (!try_reserve(n))
			break;
		else
			need_unreserve = true;
	}

	if (I == E) {
		packed_ops.push_back(p);
		return true;
	}

	if (need_unreserve) {
		while (--I != E) {
			alu_node *n = static_cast<alu_node*>(*I);
			slots[n->bc.slot] = NULL;
		}
		reinit();
	}
	return false;
}

void alu_group_tracker::reinit() {
	alu_node *s[5];
	memcpy(s, slots, sizeof(slots));

	reset(true);

	for (int i = max_slots - 1; i >= 0; --i) {
		if (s[i] && !try_reserve(s[i])) {
			sblog << "alu_group_tracker: reinit error on slot " << i << "\n";
			for (unsigned i = 0; i < max_slots; ++i) {
				sblog << " slot " << i << " : ";
				if (s[i])
					dump::dump_op(s[i]);

				sblog << "\n";
			}
			assert(!"alu_group_tracker: reinit error");
		}
	}
}

void alu_group_tracker::reset(bool keep_packed) {
	kc.reset();
	gpr.reset();
	lt.reset();
	memset(slots, 0, sizeof(slots));
	vmap.clear();
	next_id = 0;
	has_mova = false;
	uses_ar = false;
	has_predset = false;
	has_kill = false;
	updates_exec_mask = false;
	available_slots = sh.get_ctx().has_trans ? 0x1F : 0x0F;
	interp_param = 0;

	chan_count[0] = 0;
	chan_count[1] = 0;
	chan_count[2] = 0;
	chan_count[3] = 0;

	if (!keep_packed)
		packed_ops.clear();
}

void alu_group_tracker::update_flags(alu_node* n) {
	unsigned flags = n->bc.op_ptr->flags;
	has_kill |= (flags & AF_KILL);
	has_mova |= (flags & AF_MOVA);
	has_predset |= (flags & AF_ANY_PRED);
	uses_ar |= n->uses_ar();

	if (flags & AF_ANY_PRED) {
		if (n->dst[2] != NULL)
			updates_exec_mask = true;
	}
}

int post_scheduler::run() {
	run_on(sh.root);
	return 0;
}

void post_scheduler::run_on(container_node* n) {

	for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
		if (I->is_container()) {
			if (I->subtype == NST_BB) {
				bb_node* bb = static_cast<bb_node*>(*I);
				schedule_bb(bb);
			} else {
				run_on(static_cast<container_node*>(*I));
			}
		}
	}
}

void post_scheduler::init_uc_val(container_node *c, value *v) {
	node *d = v->any_def();
	if (d && d->parent == c)
		++ucm[d];
}

void post_scheduler::init_uc_vec(container_node *c, vvec &vv, bool src) {
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v || v->is_readonly())
			continue;

		if (v->is_rel()) {
			init_uc_val(c, v->rel);
			init_uc_vec(c, v->muse, true);
		}
		if (src) {
			init_uc_val(c, v);
		}
	}
}

unsigned post_scheduler::init_ucm(container_node *c, node *n) {
	init_uc_vec(c, n->src, true);
	init_uc_vec(c, n->dst, false);

	uc_map::iterator F = ucm.find(n);
	return F == ucm.end() ? 0 : F->second;
}

void post_scheduler::schedule_bb(bb_node* bb) {
	PSC_DUMP(
		sblog << "scheduling BB " << bb->id << "\n";
		if (!pending.empty())
			dump::dump_op_list(&pending);
	);

	assert(pending.empty());
	assert(bb_pending.empty());
	assert(ready.empty());

	bb_pending.append_from(bb);
	cur_bb = bb;

	node *n;

	while ((n = bb_pending.back())) {

		PSC_DUMP(
			sblog << "post_sched_bb ";
			dump::dump_op(n);
			sblog << "\n";
		);

		// May require emitting ALU ops to load index registers
		if (n->is_fetch_clause()) {
			n->remove();
			process_fetch(static_cast<container_node *>(n));
			continue;
		}

		if (n->is_alu_clause()) {
			n->remove();
			process_alu(static_cast<container_node*>(n));
			continue;
		}

		n->remove();
		bb->push_front(n);
	}

	this->cur_bb = NULL;
}

void post_scheduler::init_regmap() {

	regmap.clear();

	PSC_DUMP(
		sblog << "init_regmap: live: ";
		dump::dump_set(sh, live);
		sblog << "\n";
	);

	for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) {
		value *v = *I;
		assert(v);
		if (!v->is_sgpr() || !v->is_prealloc())
			continue;

		sel_chan r = v->gpr;

		PSC_DUMP(
			sblog << "init_regmap: " << r << " <= ";
			dump::dump_val(v);
			sblog << "\n";
		);

		assert(r);
		regmap[r] = v;
	}
}
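
// Indexed kcache/resource access: on Evergreen the CF index registers are
// loaded in two steps, a MOVA to fill AR followed by SET_CF_IDX0/1 to latch
// it (see load_index_register below); the helper here builds the
// SET_CF_IDX0/1 instruction for that second step.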
static alu_node *create_set_idx(shader &sh, unsigned ar_idx) {
	alu_node *a = sh.create_alu();

	assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1);
	if (ar_idx == V_SQ_CF_INDEX_0)
		a->bc.set_op(ALU_OP0_SET_CF_IDX0);
	else
		a->bc.set_op(ALU_OP0_SET_CF_IDX1);
	a->bc.slot = SLOT_X;
	a->dst.resize(1); // Dummy needed for recolor

	PSC_DUMP(
		sblog << "created IDX load: ";
		dump::dump_op(a);
		sblog << "\n";
	);

	return a;
}

void post_scheduler::load_index_register(value *v, unsigned ar_idx)
{
	alu.reset();

	if (!sh.get_ctx().is_cayman()) {
		// Evergreen has to first load address register, then use CF_SET_IDX0/1
		alu_group_tracker &rt = alu.grp();
		alu_node *set_idx = create_set_idx(sh, ar_idx);
		if (!rt.try_reserve(set_idx)) {
			sblog << "can't emit SET_CF_IDX";
			dump::dump_op(set_idx);
			sblog << "\n";
		}
		process_group();

		if (!alu.check_clause_limits()) {
			// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
		}
		alu.emit_group();
	}

	alu_group_tracker &rt = alu.grp();
	alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? SEL_Z : SEL_Y);

	if (!rt.try_reserve(a)) {
		sblog << "can't emit AR load : ";
		dump::dump_op(a);
		sblog << "\n";
	}

	process_group();

	if (!alu.check_clause_limits()) {
		// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
	}

	alu.emit_group();
	alu.emit_clause(cur_bb);
}

void post_scheduler::process_fetch(container_node *c) {
	if (c->empty())
		return;

	for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) {
		N = I;
		++N;

		node *n = *I;

		fetch_node *f = static_cast<fetch_node*>(n);

		PSC_DUMP(
			sblog << "process_tex ";
			dump::dump_op(n);
			sblog << " ";
		);

		// TODO: If same values used can avoid reloading index register
		if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ||
				f->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
			unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ?
				f->bc.sampler_index_mode : f->bc.resource_index_mode;

			// Currently require prior opt passes to use one TEX per indexed op
			assert(f->parent->count() == 1);

			value *v = f->src.back(); // Last src is index offset
			assert(v);

			cur_bb->push_front(c);

			load_index_register(v, index_mode);
			f->src.pop_back(); // Don't need index value any more

			return;
		}
	}

	cur_bb->push_front(c);
}

void post_scheduler::process_alu(container_node *c) {

	if (c->empty())
		return;

	ucm.clear();
	alu.reset();

	live = c->live_after;

	init_globals(c->live_after, true);
	init_globals(c->live_before, true);

	init_regmap();

	update_local_interferences();

	for (node_riterator N, I = c->rbegin(), E = c->rend(); I != E; I = N) {
		N = I;
		++N;

		node *n = *I;
		unsigned uc = init_ucm(c, n);

		PSC_DUMP(
			sblog << "process_alu uc=" << uc << " ";
			dump::dump_op(n);
			sblog << " ";
		);

		if (uc) {
			n->remove();

			pending.push_back(n);
			PSC_DUMP( sblog << "pending\n"; );
		} else {
			release_op(n);
		}
	}

	schedule_alu(c);
}

void post_scheduler::update_local_interferences() {

	PSC_DUMP(
		sblog << "update_local_interferences : ";
		dump::dump_set(sh, live);
		sblog << "\n";
	);


	for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) {
		value *v = *I;
		if (v->is_prealloc())
			continue;

		v->interferences.add_set(live);
	}
}
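
// The scheduler walks instructions bottom-up, so liveness is updated in
// reverse as each group is formed: destinations leave the live set
// (update_live_dst_vec) and sources enter it (update_live_src_vec); values
// that just became live are also collected into 'born' when requested.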
void post_scheduler::update_live_src_vec(vvec &vv, val_set *born, bool src) {
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;

		if (!v)
			continue;

		if (src && v->is_any_gpr()) {
			if (live.add_val(v)) {
				if (!v->is_prealloc()) {
					if (!cleared_interf.contains(v)) {
						PSC_DUMP(
							sblog << "clearing interferences for " << *v << "\n";
						);
						v->interferences.clear();
						cleared_interf.add_val(v);
					}
				}
				if (born)
					born->add_val(v);
			}
		} else if (v->is_rel()) {
			if (!v->rel->is_any_gpr())
				live.add_val(v->rel);
			update_live_src_vec(v->muse, born, true);
		}
	}
}

void post_scheduler::update_live_dst_vec(vvec &vv) {
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v)
			continue;

		if (v->is_rel()) {
			update_live_dst_vec(v->mdef);
		} else if (v->is_any_gpr()) {
			if (!live.remove_val(v)) {
				PSC_DUMP(
					sblog << "failed to remove ";
					dump::dump_val(v);
					sblog << " from live : ";
					dump::dump_set(sh, live);
					sblog << "\n";
				);
			}
		}
	}
}

void post_scheduler::update_live(node *n, val_set *born) {
	update_live_dst_vec(n->dst);
	update_live_src_vec(n->src, born, true);
	update_live_src_vec(n->dst, born, false);
}

void post_scheduler::process_group() {
	alu_group_tracker &rt = alu.grp();

	val_set vals_born;

	recolor_locals();

	PSC_DUMP(
		sblog << "process_group: live_before : ";
		dump::dump_set(sh, live);
		sblog << "\n";
	);

	for (unsigned s = 0; s < ctx.num_slots; ++s) {
		alu_node *n = rt.slot(s);
		if (!n)
			continue;

		update_live(n, &vals_born);
	}

	PSC_DUMP(
		sblog << "process_group: live_after : ";
		dump::dump_set(sh, live);
		sblog << "\n";
	);

	update_local_interferences();

	for (unsigned i = 0; i < 5; ++i) {
		node *n = rt.slot(i);
		if (n && !n->is_mova()) {
			release_src_values(n);
		}
	}
}

void post_scheduler::init_globals(val_set &s, bool prealloc) {

	PSC_DUMP(
		sblog << "init_globals: ";
		dump::dump_set(sh, s);
		sblog << "\n";
	);

	for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) {
		value *v = *I;
		if (v->is_sgpr() && !v->is_global()) {
			v->set_global();

			if (prealloc && v->is_fixed()) {
				v->set_prealloc();
			}
		}
	}
}

void post_scheduler::emit_index_registers() {
	for (unsigned i = 0; i < 2; i++) {
		if (alu.current_idx[i]) {
			regmap = prev_regmap;
			alu.discard_current_group();

			load_index_register(alu.current_idx[i], KC_INDEX_0 + i);
			alu.current_idx[i] = NULL;
		}
	}
}

void post_scheduler::emit_clause() {

	if (alu.current_ar) {
		emit_load_ar();
		process_group();
		alu.emit_group();
	}

	if (!alu.is_empty()) {
		alu.emit_clause(cur_bb);
	}

	emit_index_registers();
}
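
// Main ALU scheduling loop: keep assembling groups from the ready queues;
// when a group cannot be prepared or would exceed the clause limits, emit
// the accumulated clause, restore the register map snapshot and retry.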
void post_scheduler::schedule_alu(container_node *c) {

	assert(!ready.empty() || !ready_copies.empty());

	while (1) {

		prev_regmap = regmap;

		if (!prepare_alu_group()) {
			if (alu.current_idx[0] || alu.current_idx[1]) {
				regmap = prev_regmap;
				emit_clause();
				init_globals(live, false);

				continue;
			}

			if (alu.current_ar) {
				emit_load_ar();
				continue;
			} else
				break;
		}

		if (!alu.check_clause_limits()) {
			regmap = prev_regmap;
			emit_clause();
			init_globals(live, false);

			continue;
		}

		process_group();
		alu.emit_group();
	};

	if (!alu.is_empty()) {
		emit_clause();
	}

	if (!ready.empty()) {
		sblog << "##post_scheduler: unscheduled ready instructions :";
		dump::dump_op_list(&ready);
		assert(!"unscheduled ready instructions");
	}

	if (!pending.empty()) {
		sblog << "##post_scheduler: unscheduled pending instructions :";
		dump::dump_op_list(&pending);
		assert(!"unscheduled pending instructions");
	}
}

void post_scheduler::add_interferences(value *v, sb_bitset &rb, val_set &vs) {
	unsigned chan = v->gpr.chan();

	for (val_set::iterator I = vs.begin(sh), E = vs.end(sh);
			I != E; ++I) {
		value *vi = *I;
		sel_chan gpr = vi->get_final_gpr();

		if (vi->is_any_gpr() && gpr && vi != v &&
				(!v->chunk || v->chunk != vi->chunk) &&
				vi->is_fixed() && gpr.chan() == chan) {

			unsigned r = gpr.sel();

			PSC_DUMP(
				sblog << "\tadd_interferences: " << *vi << "\n";
			);

			if (rb.size() <= r)
				rb.resize(r + 32);
			rb.set(r);
		}
	}
}

void post_scheduler::set_color_local_val(value *v, sel_chan color) {
	v->gpr = color;

	PSC_DUMP(
		sblog << " recolored: ";
		dump::dump_val(v);
		sblog << "\n";
	);
}

void post_scheduler::set_color_local(value *v, sel_chan color) {
	if (v->chunk) {
		vvec &vv = v->chunk->values;
		for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
			value *v2 = *I;
			set_color_local_val(v2, color);
		}
		v->chunk->fix();
	} else {
		set_color_local_val(v, color);
		v->fix();
	}
}

bool post_scheduler::recolor_local(value *v) {

	sb_bitset rb;

	assert(v->is_sgpr());
	assert(!v->is_prealloc());
	assert(v->gpr);

	unsigned chan = v->gpr.chan();

	PSC_DUMP(
		sblog << "recolor_local: ";
		dump::dump_val(v);
		sblog << " interferences: ";
		dump::dump_set(sh, v->interferences);
		sblog << "\n";
		if (v->chunk) {
			sblog << " in chunk: ";
			coalescer::dump_chunk(v->chunk);
			sblog << "\n";
		}
	);

	if (v->chunk) {
		for (vvec::iterator I = v->chunk->values.begin(),
				E = v->chunk->values.end(); I != E; ++I) {
			value *v2 = *I;

			PSC_DUMP( sblog << " add_interferences for " << *v2 << " :\n"; );

			add_interferences(v, rb, v2->interferences);
		}
	} else {
		add_interferences(v, rb, v->interferences);
	}

	PSC_DUMP(
		unsigned sz = rb.size();
		sblog << "registers bits: " << sz;
		for (unsigned r = 0; r < sz; ++r) {
			if ((r & 7) == 0)
				sblog << "\n " << r << " ";
			sblog << (rb.get(r) ? 1 : 0);
		}
	);

	bool no_temp_gprs = v->is_global();
	unsigned rs, re, pass = no_temp_gprs ? 1 : 0;
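
	// Two-pass search for a free register with the required channel: pass 0
	// prefers the clause-temporary GPR range (first_temp_gpr()..MAX_GPR),
	// pass 1 falls back to the normal range; global values skip pass 0
	// since clause temporaries don't survive across clause boundaries.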
	while (pass < 2) {

		if (pass == 0) {
			rs = sh.first_temp_gpr();
			re = MAX_GPR;
		} else {
			rs = 0;
			re = sh.num_nontemp_gpr();
		}

		for (unsigned reg = rs; reg < re; ++reg) {
			if (reg >= rb.size() || !rb.get(reg)) {
				// color found
				set_color_local(v, sel_chan(reg, chan));
				return true;
			}
		}
		++pass;
	}

	assert(!"recolor_local failed");
	return true;
}

void post_scheduler::emit_load_ar() {

	regmap = prev_regmap;
	alu.discard_current_group();

	alu_group_tracker &rt = alu.grp();
	alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X);

	if (!rt.try_reserve(a)) {
		sblog << "can't emit AR load : ";
		dump::dump_op(a);
		sblog << "\n";
	}

	alu.current_ar = 0;
}

bool post_scheduler::unmap_dst_val(value *d) {

	if (d == alu.current_ar) {
		emit_load_ar();
		return false;
	}

	if (d->is_prealloc()) {
		sel_chan gpr = d->get_final_gpr();
		rv_map::iterator F = regmap.find(gpr);
		value *c = NULL;
		if (F != regmap.end())
			c = F->second;

		if (c && c != d && (!c->chunk || c->chunk != d->chunk)) {
			PSC_DUMP(
				sblog << "dst value conflict : ";
				dump::dump_val(d);
				sblog << " regmap contains ";
				dump::dump_val(c);
				sblog << "\n";
			);
			assert(!"scheduler error");
			return false;
		} else if (c) {
			regmap.erase(F);
		}
	}
	return true;
}

bool post_scheduler::unmap_dst(alu_node *n) {
	value *d = n->dst.empty() ? NULL : n->dst[0];

	if (!d)
		return true;

	if (!d->is_rel()) {
		if (d && d->is_any_reg()) {

			if (d->is_AR()) {
				if (alu.current_ar != d) {
					sblog << "loading wrong ar value\n";
					assert(0);
				} else {
					alu.current_ar = NULL;
				}

			} else if (d->is_any_gpr()) {
				if (!unmap_dst_val(d))
					return false;
			}
		}
	} else {
		for (vvec::iterator I = d->mdef.begin(), E = d->mdef.end();
				I != E; ++I) {
			d = *I;
			if (!d)
				continue;

			assert(d->is_any_gpr());

			if (!unmap_dst_val(d))
				return false;
		}
	}
	return true;
}

bool post_scheduler::map_src_val(value *v) {

	if (!v->is_prealloc())
		return true;

	sel_chan gpr = v->get_final_gpr();
	rv_map::iterator F = regmap.find(gpr);
	value *c = NULL;
	if (F != regmap.end()) {
		c = F->second;
		if (!v->v_equal(c)) {
			PSC_DUMP(
				sblog << "can't map src value ";
				dump::dump_val(v);
				sblog << ", regmap contains ";
				dump::dump_val(c);
				sblog << "\n";
			);
			return false;
		}
	} else {
		regmap.insert(std::make_pair(gpr, v));
	}
	return true;
}
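
// Besides mapping source GPRs into regmap, this also records UBO index
// values (kcache index modes KC_INDEX_0/1) in alu.current_idx[] and the
// relative-addressing register in alu.current_ar, failing if a conflicting
// index or AR value is already active in the current clause/group.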
bool post_scheduler::map_src_vec(vvec &vv, bool src) {
	if (src) {
		// Handle possible UBO indexing
		bool ubo_indexing[2] = { false, false };
		for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
			value *v = *I;
			if (!v)
				continue;

			if (v->is_kcache()) {
				unsigned index_mode = v->select.kcache_index_mode();
				if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) {
					ubo_indexing[index_mode - KC_INDEX_0] = true;
				}
			}
		}

		// idx values stored at end of src vec, see bc_parser::prepare_alu_group
		for (unsigned i = 2; i != 0; i--) {
			if (ubo_indexing[i-1]) {
				// TODO: skip adding value to kcache reservation somehow, causes
				// unnecessary group breaks and cache line locks
				value *v = vv.back();
				if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) {
					PSC_DUMP(
						sblog << "IDX" << i-1 << " already set to " <<
							*alu.current_idx[i-1] << ", trying to set " << *v << "\n";
					);
					return false;
				}

				alu.current_idx[i-1] = v;
				PSC_DUMP( sblog << "IDX" << i-1 << " set to " << *v << "\n"; );
			}
		}
	}

	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v)
			continue;

		if ((!v->is_any_gpr() || !v->is_fixed()) && !v->is_rel())
			continue;

		if (v->is_rel()) {
			value *rel = v->rel;
			assert(rel);

			if (!rel->is_const()) {
				if (!map_src_vec(v->muse, true))
					return false;

				if (rel != alu.current_ar) {
					if (alu.current_ar) {
						PSC_DUMP(
							sblog << " current_AR is " << *alu.current_ar
								<< " trying to use " << *rel << "\n";
						);
						return false;
					}

					alu.current_ar = rel;

					PSC_DUMP(
						sblog << " new current_AR assigned: " << *alu.current_ar
							<< "\n";
					);
				}
			}

		} else if (src) {
			if (!map_src_val(v)) {
				return false;
			}
		}
	}
	return true;
}

bool post_scheduler::map_src(alu_node *n) {
	if (!map_src_vec(n->dst, false))
		return false;

	if (!map_src_vec(n->src, true))
		return false;

	return true;
}

void post_scheduler::dump_regmap() {

	sblog << "# REGMAP :\n";

	for (rv_map::iterator I = regmap.begin(), E = regmap.end(); I != E; ++I) {
		sblog << " # " << I->first << " => " << *(I->second) << "\n";
	}

	if (alu.current_ar)
		sblog << " current_AR: " << *alu.current_ar << "\n";
	if (alu.current_pr)
		sblog << " current_PR: " << *alu.current_pr << "\n";
	if (alu.current_idx[0])
		sblog << " current IDX0: " << *alu.current_idx[0] << "\n";
	if (alu.current_idx[1])
		sblog << " current IDX1: " << *alu.current_idx[1] << "\n";
}

void post_scheduler::recolor_locals() {
	alu_group_tracker &rt = alu.grp();

	for (unsigned s = 0; s < ctx.num_slots; ++s) {
		alu_node *n = rt.slot(s);
		if (n) {
			value *d = n->dst[0];
			if (d && d->is_sgpr() && !d->is_prealloc()) {
				recolor_local(d);
			}
		}
	}
}

// returns true if there are interferences
bool post_scheduler::check_interferences() {

	alu_group_tracker &rt = alu.grp();

	unsigned interf_slots;

	bool discarded = false;

	PSC_DUMP(
		sblog << "check_interferences: before: \n";
		dump_regmap();
	);

	do {

		interf_slots = 0;

		for (unsigned s = 0; s < ctx.num_slots; ++s) {
			alu_node *n = rt.slot(s);
			if (n) {
				if (!unmap_dst(n)) {
					return true;
				}
			}
		}

		for (unsigned s = 0; s < ctx.num_slots; ++s) {
			alu_node *n = rt.slot(s);
			if (n) {
				if (!map_src(n)) {
					interf_slots |= (1 << s);
				}
			}
		}

		PSC_DUMP(
			for (unsigned i = 0; i < 5; ++i) {
				if (interf_slots & (1 << i)) {
					sblog << "!!!!!! interf slot: " << i << " : ";
					dump::dump_op(rt.slot(i));
					sblog << "\n";
				}
			}
		);

		if (!interf_slots)
			break;

		PSC_DUMP( sblog << "ci: discarding slots " << interf_slots << "\n"; );

		rt.discard_slots(interf_slots, alu.conflict_nodes);
		regmap = prev_regmap;
		discarded = true;

	} while (1);

	PSC_DUMP(
		sblog << "check_interferences: after: \n";
		dump_regmap();
	);

	return discarded;
}

// add instruction(s) (alu_node or contents of alu_packed_node) to current group
// returns the number of added instructions on success
unsigned post_scheduler::try_add_instruction(node *n) {

	alu_group_tracker &rt = alu.grp();

	unsigned avail_slots = rt.avail_slots();

	// Cannot schedule in same clause as instructions using this index value
	if (!n->dst.empty() && n->dst[0] &&
			(n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) {
		PSC_DUMP( sblog << " CF_IDX source: " << *n->dst[0] << "\n"; );
		return 0;
	}

	if (n->is_alu_packed()) {
		alu_packed_node *p = static_cast<alu_packed_node*>(n);
		unsigned slots = p->get_slot_mask();
		unsigned cnt = __builtin_popcount(slots);

		if ((slots & avail_slots) != slots) {
			PSC_DUMP( sblog << " no slots \n"; );
			return 0;
		}

		p->update_packed_items(ctx);

		if (!rt.try_reserve(p)) {
			PSC_DUMP( sblog << " reservation failed \n"; );
			return 0;
		}

		p->remove();
		return cnt;

	} else {
		alu_node *a = static_cast<alu_node*>(n);
		value *d = a->dst.empty() ? NULL : a->dst[0];

		if (d && d->is_special_reg()) {
			assert((a->bc.op_ptr->flags & AF_MOVA) || d->is_geometry_emit());
			d = NULL;
		}

		unsigned allowed_slots = ctx.alu_slots_mask(a->bc.op_ptr);
		unsigned slot;

		allowed_slots &= avail_slots;

		if (!allowed_slots)
			return 0;

		if (d) {
			slot = d->get_final_chan();
			a->bc.dst_chan = slot;
			allowed_slots &= (1 << slot) | 0x10;
		} else {
			if (a->bc.op_ptr->flags & AF_MOVA) {
				if (a->bc.slot_flags & AF_V)
					allowed_slots &= (1 << SLOT_X);
				else
					allowed_slots &= (1 << SLOT_TRANS);
			}
		}

		// FIXME workaround for some problems with MULADD in trans slot on r700,
		// (is it really needed on r600?)
		if ((a->bc.op == ALU_OP3_MULADD || a->bc.op == ALU_OP3_MULADD_IEEE) &&
				!ctx.is_egcm()) {
			allowed_slots &= 0x0F;
		}

		if (!allowed_slots) {
			PSC_DUMP( sblog << " no suitable slots\n"; );
			return 0;
		}

		slot = __builtin_ctz(allowed_slots);
		a->bc.slot = slot;

		PSC_DUMP( sblog << "slot: " << slot << "\n"; );

		if (!rt.try_reserve(a)) {
			PSC_DUMP( sblog << " reservation failed\n"; );
			return 0;
		}

		a->remove();
		return 1;
	}
}

bool post_scheduler::check_copy(node *n) {
	if (!n->is_copy_mov())
		return false;

	value *s = n->src[0];
	value *d = n->dst[0];

	if (!s->is_sgpr() || !d->is_sgpr())
		return false;

	if (!s->is_prealloc()) {
		recolor_local(s);

		if (!s->chunk || s->chunk != d->chunk)
			return false;
	}

	if (s->gpr == d->gpr) {

		PSC_DUMP(
			sblog << "check_copy: ";
			dump::dump_op(n);
			sblog << "\n";
		);

		rv_map::iterator F = regmap.find(d->gpr);
		bool gpr_free = (F == regmap.end());

		if (d->is_prealloc()) {
			if (gpr_free) {
				PSC_DUMP( sblog << " copy not ready...\n"; );
				return true;
			}

			value *rv = F->second;
			if (rv != d && (!rv->chunk || rv->chunk != d->chunk)) {
				PSC_DUMP( sblog << " copy not ready(2)...\n"; );
				return true;
			}

			unmap_dst(static_cast<alu_node*>(n));
		}

		if (s->is_prealloc() && !map_src_val(s))
			return true;

		update_live(n, NULL);

		release_src_values(n);
		n->remove();
		PSC_DUMP( sblog << " copy coalesced...\n"; );
		return true;
	}
	return false;
}

void post_scheduler::dump_group(alu_group_tracker &rt) {
	for (unsigned i = 0; i < 5; ++i) {
		node *n = rt.slot(i);
		if (n) {
			sblog << "slot " << i << " : ";
			dump::dump_op(n);
			sblog << "\n";
		}
	}
}

void post_scheduler::process_ready_copies() {

	node *last;

	do {
		last = ready_copies.back();

		for (node_iterator N, I = ready_copies.begin(), E = ready_copies.end();
				I != E; I = N) {
			N = I; ++N;

			node *n = *I;

			if (!check_copy(n)) {
				n->remove();
				ready.push_back(n);
			}
		}
	} while (last != ready_copies.back());

	update_local_interferences();
}
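
// Build one ALU group from the ready queues: coalescible copies are handled
// first, then ready instructions are packed into slots until the group is
// full or nothing else fits, and register interferences are resolved
// (possibly discarding conflicting slots) before the group is accepted.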
bool post_scheduler::prepare_alu_group() {

	alu_group_tracker &rt = alu.grp();

	unsigned i1 = 0;

	PSC_DUMP(
		sblog << "prepare_alu_group: starting...\n";
		dump_group(rt);
	);

	ready.append_from(&alu.conflict_nodes);

	// FIXME rework this loop

	do {

		process_ready_copies();

		++i1;

		for (node_iterator N, I = ready.begin(), E = ready.end(); I != E;
				I = N) {
			N = I; ++N;
			node *n = *I;

			PSC_DUMP(
				sblog << "p_a_g: ";
				dump::dump_op(n);
				sblog << "\n";
			);


			unsigned cnt = try_add_instruction(n);

			if (!cnt)
				continue;

			PSC_DUMP(
				sblog << "current group:\n";
				dump_group(rt);
			);

			if (rt.inst_count() == ctx.num_slots) {
				PSC_DUMP( sblog << " all slots used\n"; );
				break;
			}
		}

		if (!check_interferences())
			break;

		// don't try to add more instructions to the group with mova if this
		// can lead to breaking the clause slot count limit - we don't want
		// mova to end up at the end of a new clause instead of at the
		// beginning of the current one.
		if (rt.has_ar_load() && alu.total_slots() > 121)
			break;

		if (rt.inst_count() && i1 > 50)
			break;

		regmap = prev_regmap;

	} while (1);

	PSC_DUMP(
		sblog << " prepare_alu_group done, " << rt.inst_count()
			<< " slot(s) \n";

		sblog << "$$$$$$$$PAG i1=" << i1
				<< " ready " << ready.count()
				<< " pending " << pending.count()
				<< " conflicting " << alu.conflict_nodes.count()
				<< "\n";

	);

	return rt.inst_count();
}

void post_scheduler::release_src_values(node* n) {
	release_src_vec(n->src, true);
	release_src_vec(n->dst, false);
}

void post_scheduler::release_op(node *n) {
	PSC_DUMP(
		sblog << "release_op ";
		dump::dump_op(n);
		sblog << "\n";
	);

	n->remove();

	if (n->is_copy_mov()) {
		ready_copies.push_back(n);
	} else if (n->is_mova() || n->is_pred_set()) {
		ready.push_front(n);
	} else {
		ready.push_back(n);
	}
}

void post_scheduler::release_src_val(value *v) {
	node *d = v->any_def();
	if (d) {
		if (!--ucm[d])
			release_op(d);
	}
}

void post_scheduler::release_src_vec(vvec& vv, bool src) {

	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v || v->is_readonly())
			continue;

		if (v->is_rel()) {
			release_src_val(v->rel);
			release_src_vec(v->muse, true);

		} else if (src) {
			release_src_val(v);
		}
	}
}

void literal_tracker::reset() {
	memset(lt, 0, sizeof(lt));
	memset(uc, 0, sizeof(uc));
}

void rp_gpr_tracker::reset() {
	memset(rp, 0, sizeof(rp));
	memset(uc, 0, sizeof(uc));
}

void rp_kcache_tracker::reset() {
	memset(rp, 0, sizeof(rp));
	memset(uc, 0, sizeof(uc));
}

void alu_kcache_tracker::reset() {
	memset(kc, 0, sizeof(kc));
	lines.clear();
}

void alu_clause_tracker::reset() {
	group = 0;
	slot_count = 0;
	grp0.reset();
	grp1.reset();
}

alu_clause_tracker::alu_clause_tracker(shader &sh)
	: sh(sh), kt(sh.get_ctx().hw_class), slot_count(),
	  grp0(sh), grp1(sh),
	  group(), clause(),
	  push_exec_mask(),
	  current_ar(), current_pr(), current_idx() {}

void alu_clause_tracker::emit_group() {

	assert(grp().inst_count());

	alu_group_node *g = grp().emit();

	if (grp().has_update_exec_mask()) {
		assert(!push_exec_mask);
		push_exec_mask = true;
	}

	assert(g);

	if (!clause) {
		clause = sh.create_clause(NST_ALU_CLAUSE);
	}

	clause->push_front(g);

	slot_count += grp().slot_count();

	new_group();

	PSC_DUMP( sblog << " #### group emitted\n"; );
}

void alu_clause_tracker::emit_clause(container_node *c) {
	assert(clause);

	kt.init_clause(clause->bc);

	assert(!current_ar);
	assert(!current_pr);

	if (push_exec_mask)
		clause->bc.set_op(CF_OP_ALU_PUSH_BEFORE);

	c->push_front(clause);

	clause = NULL;
	push_exec_mask = false;
	slot_count = 0;
	kt.reset();

	PSC_DUMP( sblog << "######### ALU clause emitted\n"; );
}
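
// Check that the current group still fits into the clause: the slot count
// is bounded by MAX_ALU_SLOTS, and a few slots are kept in reserve so that
// pending AR/PR and index-register loads can still be emitted within this
// clause; the group's kcache line locks are validated as well.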
bool alu_clause_tracker::check_clause_limits() {

	alu_group_tracker &gt = grp();

	unsigned slots = gt.slot_count();

	// reserving slots to load AR and PR values
	unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0);
	// ...and index registers
	reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL);

	if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots)
		return false;

	if (!kt.try_reserve(gt))
		return false;

	return true;
}

void alu_clause_tracker::new_group() {
	group = !group;
	grp().reset();
}

bool alu_clause_tracker::is_empty() {
	return clause == NULL;
}

void literal_tracker::init_group_literals(alu_group_node* g) {

	g->literals.clear();
	for (unsigned i = 0; i < 4; ++i) {
		if (!lt[i])
			break;

		g->literals.push_back(lt[i]);

		PSC_DUMP(
			sblog << "literal emitted: " << lt[i].f;
			sblog.print_zw_hex(lt[i].u, 8);
			sblog << " " << lt[i].i << "\n";
		);
	}
}

bool alu_kcache_tracker::try_reserve(alu_group_tracker& gt) {
	rp_kcache_tracker &kt = gt.kcache();

	if (!kt.num_sels())
		return true;

	sb_set<unsigned> group_lines;

	unsigned nl = kt.get_lines(group_lines);
	assert(nl);

	sb_set<unsigned> clause_lines(lines);
	lines.add_set(group_lines);

	if (clause_lines.size() == lines.size())
		return true;

	if (update_kc())
		return true;

	lines = clause_lines;

	return false;
}

unsigned rp_kcache_tracker::get_lines(kc_lines& lines) {
	unsigned cnt = 0;

	for (unsigned i = 0; i < sel_count; ++i) {
		unsigned line = rp[i] & 0x1fffffffu;
		unsigned index_mode = rp[i] >> 29;

		if (!line)
			return cnt;

		--line;
		line = (sel_count == 2) ? line >> 5 : line >> 6;
		line |= index_mode << 29;

		if (lines.insert(line).second)
			++cnt;
	}
	return cnt;
}

bool alu_kcache_tracker::update_kc() {
	unsigned c = 0;

	bc_kcache old_kc[4];
	memcpy(old_kc, kc, sizeof(kc));

	for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) {
		unsigned index_mode = *I >> 29;
		unsigned line = *I & 0x1fffffffu;
		unsigned bank = line >> 8;

		assert(index_mode <= KC_INDEX_INVALID);
		line &= 0xFF;

		if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) &&
				kc[c-1].index_mode == index_mode)
		{
			kc[c-1].mode = KC_LOCK_2;
		} else {
			if (c == max_kcs) {
				memcpy(kc, old_kc, sizeof(kc));
				return false;
			}

			kc[c].mode = KC_LOCK_1;

			kc[c].bank = bank;
			kc[c].addr = line;
			kc[c].index_mode = index_mode;
			++c;
		}
	}
	return true;
}
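
// AR loads use MOVA_GPR_INT or MOVA_INT depending on the chip; on Cayman
// the CF index registers are written directly by MOVA through the special
// CM_V_SQ_MOVA_DST_CF_IDX0/1 destinations, selected here via the requested
// channel (SEL_Y/SEL_Z), so no separate SET_CF_IDX step is needed there.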
alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) {
	alu_node *a = sh.create_alu();

	if (sh.get_ctx().uses_mova_gpr) {
		a->bc.set_op(ALU_OP1_MOVA_GPR_INT);
		a->bc.slot = SLOT_TRANS;
	} else {
		a->bc.set_op(ALU_OP1_MOVA_INT);
		a->bc.slot = SLOT_X;
	}
	a->bc.dst_chan = ar_channel;
	if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) {
		a->bc.dst_gpr = ar_channel == SEL_Y ?
			CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
	}

	a->dst.resize(1);
	a->src.push_back(v);

	PSC_DUMP(
		sblog << "created AR load: ";
		dump::dump_op(a);
		sblog << "\n";
	);

	return a;
}

void alu_clause_tracker::discard_current_group() {
	PSC_DUMP( sblog << "act::discard_current_group\n"; );
	grp().discard_all_slots(conflict_nodes);
}

void rp_gpr_tracker::dump() {
	sblog << "=== gpr_tracker dump:\n";
	for (int c = 0; c < 3; ++c) {
		sblog << "cycle " << c << " ";
		for (int h = 0; h < 4; ++h) {
			sblog << rp[c][h] << ":" << uc[c][h] << " ";
		}
		sblog << "\n";
	}
}

} // namespace r600_sb