/*
 * Copyright 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_copy_propagation.cpp
 *
 * Support for global copy propagation in two passes: a local pass that does
 * intra-block copy (and constant) propagation, and a global pass that uses
 * dataflow analysis on the copies available at the end of each block to re-do
 * local copy propagation with more copies available.
 *
 * See Muchnick's Advanced Compiler Design and Implementation, section
 * 12.5 (p356).
 */

/* Number of buckets in the per-block ACP hash tables, which are indexed by
 * VGRF number modulo this size.
 */
#define ACP_HASH_SIZE 16

#include "util/bitset.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_eu.h"

namespace { /* avoid conflict with opt_copy_propagation_elements */
struct acp_entry : public exec_node {
   fs_reg dst;
   fs_reg src;
   uint8_t size_written;
   uint8_t size_read;
   enum opcode opcode;
   bool saturate;
};

struct block_data {
   /**
    * Which entries in the fs_copy_prop_dataflow acp table are live at the
    * start of this block. This is the useful output of the analysis, since
    * it lets us plug those into the local copy propagation on the second
    * pass.
    */
   BITSET_WORD *livein;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are live at the end
    * of this block. This is done in initial setup from the per-block acps
    * returned by the first local copy prop pass.
    */
   BITSET_WORD *liveout;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are generated by
    * instructions in this block which reach the end of the block without
    * being killed.
    */
   BITSET_WORD *copy;

   /**
    * Which entries in the fs_copy_prop_dataflow acp table are killed over the
    * course of this block.
    */
   BITSET_WORD *kill;
};
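
/* In dataflow terms, the per-block sets above define a standard forward
 * "available copies" problem (see the Muchnick reference in the file
 * comment). fs_copy_prop_dataflow::run() below iterates
 *
 *    liveout(b) = copy(b) | (livein(b) & ~kill(b))
 *    livein(b)  = intersection of liveout(p) over all parents p of b
 *
 * to a fixed point, with livein of the start block empty and livein of every
 * other block initialized to the universal set (see setup_initial_values()).
 */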

class fs_copy_prop_dataflow
{
public:
   fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
                         exec_list *out_acp[ACP_HASH_SIZE]);

   void setup_initial_values();
   void run();

   void dump_block_data() const UNUSED;

   void *mem_ctx;
   cfg_t *cfg;

   acp_entry **acp;
   int num_acp;
   int bitset_words;

   struct block_data *bd;
};
} /* anonymous namespace */

fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
                                             exec_list *out_acp[ACP_HASH_SIZE])
   : mem_ctx(mem_ctx), cfg(cfg)
{
   bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);

   num_acp = 0;
   foreach_block (block, cfg) {
      for (int i = 0; i < ACP_HASH_SIZE; i++) {
         num_acp += out_acp[block->num][i].length();
      }
   }

   acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp);

   bitset_words = BITSET_WORDS(num_acp);

   int next_acp = 0;
   foreach_block (block, cfg) {
      bd[block->num].livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
      bd[block->num].liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
      bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
      bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words);

      for (int i = 0; i < ACP_HASH_SIZE; i++) {
         foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
            acp[next_acp] = entry;

            /* opt_copy_propagation_local populates out_acp with copies created
             * in a block which are still live at the end of the block. This
             * is exactly what we want in the COPY set.
             */
            BITSET_SET(bd[block->num].copy, next_acp);

            next_acp++;
         }
      }
   }

   assert(next_acp == num_acp);

   setup_initial_values();
   run();
}

/**
 * Set up initial values for each of the data flow sets, prior to running
 * the fixed-point algorithm.
 */
void
fs_copy_prop_dataflow::setup_initial_values()
{
   /* Initialize the COPY and KILL sets. */
   foreach_block (block, cfg) {
      foreach_inst_in_block(fs_inst, inst, block) {
         if (inst->dst.file != VGRF)
            continue;

         /* Mark ACP entries which are killed by this instruction. */
         for (int i = 0; i < num_acp; i++) {
            if (regions_overlap(inst->dst, inst->size_written,
                                acp[i]->dst, acp[i]->size_written) ||
                regions_overlap(inst->dst, inst->size_written,
                                acp[i]->src, acp[i]->size_read)) {
               BITSET_SET(bd[block->num].kill, i);
            }
         }
      }
   }

   /* Populate the initial values for the livein and liveout sets. For the
    * block at the start of the program, livein = 0 and liveout = copy.
    * For the others, set liveout to 0 (the empty set) and livein to ~0
    * (the universal set).
    */
   foreach_block (block, cfg) {
      if (block->parents.is_empty()) {
         for (int i = 0; i < bitset_words; i++) {
            bd[block->num].livein[i] = 0u;
            bd[block->num].liveout[i] = bd[block->num].copy[i];
         }
      } else {
         for (int i = 0; i < bitset_words; i++) {
            bd[block->num].liveout[i] = 0u;
            bd[block->num].livein[i] = ~0u;
         }
      }
   }
}

/**
 * Propagate the copies through the control flow graph by iterating the
 * dataflow equations to a fixed point: update each block's liveout from its
 * copy, livein and kill sets, and its livein from the intersection of its
 * parents' liveouts.
 */
void
fs_copy_prop_dataflow::run()
{
   bool progress;

   do {
      progress = false;

      /* Update liveout for all blocks. */
      foreach_block (block, cfg) {
         if (block->parents.is_empty())
            continue;

         for (int i = 0; i < bitset_words; i++) {
            const BITSET_WORD old_liveout = bd[block->num].liveout[i];

            bd[block->num].liveout[i] =
               bd[block->num].copy[i] | (bd[block->num].livein[i] &
                                         ~bd[block->num].kill[i]);

            if (old_liveout != bd[block->num].liveout[i])
               progress = true;
         }
      }

      /* Update livein for all blocks. If a copy is live out of all parent
       * blocks, it's live coming in to this block.
       */
      foreach_block (block, cfg) {
         if (block->parents.is_empty())
            continue;

         for (int i = 0; i < bitset_words; i++) {
            const BITSET_WORD old_livein = bd[block->num].livein[i];

            bd[block->num].livein[i] = ~0u;
            foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
               bblock_t *parent = parent_link->block;
               bd[block->num].livein[i] &= bd[parent->num].liveout[i];
            }

            if (old_livein != bd[block->num].livein[i])
               progress = true;
         }
      }
   } while (progress);
}

void
fs_copy_prop_dataflow::dump_block_data() const
{
   foreach_block (block, cfg) {
      fprintf(stderr, "Block %d [%d, %d] (parents ", block->num,
              block->start_ip, block->end_ip);
      foreach_list_typed(bblock_link, link, link, &block->parents) {
         bblock_t *parent = link->block;
         fprintf(stderr, "%d ", parent->num);
      }
      fprintf(stderr, "):\n");
      fprintf(stderr, " livein = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].livein[i]);
      fprintf(stderr, ", liveout = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].liveout[i]);
      fprintf(stderr, ",\n copy = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].copy[i]);
      fprintf(stderr, ", kill = 0x");
      for (int i = 0; i < bitset_words; i++)
         fprintf(stderr, "%08x", bd[block->num].kill[i]);
      fprintf(stderr, "\n");
   }
}

static bool
is_logic_op(enum opcode opcode)
{
   return (opcode == BRW_OPCODE_AND ||
           opcode == BRW_OPCODE_OR ||
           opcode == BRW_OPCODE_XOR ||
           opcode == BRW_OPCODE_NOT);
}

static bool
can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
                const gen_device_info *devinfo)
{
   if (stride > 4)
      return false;

   /* 3-source instructions can only be Align16, which restricts what strides
    * they can take. They can only take a stride of 1 (the usual case), or 0
    * with a special "repctrl" bit. But the repctrl bit doesn't work for
    * 64-bit datatypes, so if the source type is 64-bit then only a stride of
    * 1 is allowed. From the Broadwell PRM, Volume 7 "3D Media GPGPU", page
    * 944:
    *
    *    This is applicable to 32b datatypes and 16b datatype. 64b datatypes
    *    cannot use the replicate control.
    */
   if (inst->is_3src(devinfo)) {
      if (type_sz(inst->src[arg].type) > 4)
         return stride == 1;
      else
         return stride == 1 || stride == 0;
   }

   /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions",
    * page 391 ("Extended Math Function"):
    *
    *    The following restrictions apply for align1 mode: Scalar source is
    *    supported. Source and destination horizontal stride must be the
    *    same.
    *
    * From the Haswell PRM Volume 2b "Command Reference - Instructions", page
    * 134 ("Extended Math Function"):
    *
    *    Scalar source is supported.
    *    Source and destination horizontal stride must be 1.
    *
    * and similar language exists for IVB and SNB. Pre-SNB, math instructions
    * are sends, so the sources are moved to MRFs and there are no
    * restrictions.
    */
   if (inst->is_math()) {
      if (devinfo->gen == 6 || devinfo->gen == 7) {
         assert(inst->dst.stride == 1);
         return stride == 1 || stride == 0;
      } else if (devinfo->gen >= 8) {
         return stride == inst->dst.stride || stride == 0;
      }
   }

   return true;
}

bool
fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
{
   if (inst->src[arg].file != VGRF)
      return false;

   if (entry->src.file == IMM)
      return false;
   assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
          entry->src.file == ATTR);

   if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
       inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   assert(entry->dst.file == VGRF);
   if (inst->src[arg].nr != entry->dst.nr)
      return false;

   /* Bail if inst is reading a range that isn't contained in the range
    * that entry is writing.
    */
   if (!region_contained_in(inst->src[arg], inst->size_read(arg),
                            entry->dst, entry->size_written))
      return false;

   /* We can't generally copy-propagate UD negations because we
    * can end up accessing the resulting values as signed integers
    * instead. See also resolve_ud_negate() and the comment in
    * fs_generator::generate_code.
    */
   if (entry->src.type == BRW_REGISTER_TYPE_UD &&
       entry->src.negate)
      return false;

   bool has_source_modifiers = entry->src.abs || entry->src.negate;

   if ((has_source_modifiers || entry->src.file == UNIFORM ||
        !entry->src.is_contiguous()) &&
       !inst->can_do_source_mods(devinfo))
      return false;

   if (has_source_modifiers &&
       inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
      return false;

   /* Bail if the result of composing both strides would exceed the
    * hardware limit.
    */
   if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
                        devinfo))
      return false;

   /* Bail if the instruction type is larger than the execution type of the
    * copy, which implies that each channel is reading multiple channels of
    * the destination of the copy, and simply replacing the sources would
    * give a program with different semantics.
    */
   if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
      return false;

   /* Bail if the result of composing both strides cannot be expressed
    * as another stride. This avoids, for example, trying to transform
    * this:
    *
    *    MOV (8) rX<1>UD rY<0;1,0>UD
    *    FOO (8) ...     rX<8;8,1>UW
    *
    * into this:
    *
    *    FOO (8) ...     rY<0;1,0>UW
    *
    * which would have different semantics.
    */
   if (entry->src.stride != 1 &&
       (inst->src[arg].stride *
        type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
      return false;

   /* Since semantics of source modifiers are type-dependent we need to
    * ensure that the meaning of the instruction remains the same if we
    * change the type. If the sizes of the types are different the new
    * instruction will read a different amount of data than the original
    * and the semantics will always be different.
    */
   if (has_source_modifiers &&
       entry->dst.type != inst->src[arg].type &&
       (!inst->can_change_types() ||
        type_sz(entry->dst.type) != type_sz(inst->src[arg].type)))
      return false;

   if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) &&
       is_logic_op(inst->opcode)) {
      return false;
   }

   if (entry->saturate) {
      switch(inst->opcode) {
      case BRW_OPCODE_SEL:
         if ((inst->conditional_mod != BRW_CONDITIONAL_GE &&
              inst->conditional_mod != BRW_CONDITIONAL_L) ||
             inst->src[1].file != IMM ||
             inst->src[1].f < 0.0 ||
             inst->src[1].f > 1.0) {
            return false;
         }
         break;
      default:
         return false;
      }
   }

   inst->src[arg].file = entry->src.file;
   inst->src[arg].nr = entry->src.nr;
   inst->src[arg].stride *= entry->src.stride;
   inst->saturate = inst->saturate || entry->saturate;

   /* Compute the offset of inst->src[arg] relative to entry->dst */
   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;

   /* Compute the first component of the copy that the instruction is
    * reading, and the base byte offset within that component.
    */
   assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1);
   const unsigned component = rel_offset / type_sz(entry->dst.type);
   const unsigned suboffset = rel_offset % type_sz(entry->dst.type);

   /* Calculate the byte offset at the origin of the copy of the given
    * component and suboffset.
    */
   inst->src[arg].offset = suboffset +
      component * entry->src.stride * type_sz(entry->src.type) +
      entry->src.offset;

   if (has_source_modifiers) {
      if (entry->dst.type != inst->src[arg].type) {
         /* We are propagating source modifiers from a MOV with a different
          * type. If we got here, then we can just change the source and
          * destination types of the instruction and keep going.
          */
         assert(inst->can_change_types());
         for (int i = 0; i < inst->sources; i++) {
            inst->src[i].type = entry->dst.type;
         }
         inst->dst.type = entry->dst.type;
      }

      if (!inst->src[arg].abs) {
         inst->src[arg].abs = entry->src.abs;
         inst->src[arg].negate ^= entry->src.negate;
      }
   }

   return true;
}


bool
fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
{
   bool progress = false;

   if (entry->src.file != IMM)
      return false;
   if (type_sz(entry->src.type) > 4)
      return false;
   if (entry->saturate)
      return false;

   for (int i = inst->sources - 1; i >= 0; i--) {
      if (inst->src[i].file != VGRF)
         continue;

      assert(entry->dst.file == VGRF);
      if (inst->src[i].nr != entry->dst.nr)
         continue;

      /* Bail if inst is reading a range that isn't contained in the range
       * that entry is writing.
       */
      if (!region_contained_in(inst->src[i], inst->size_read(i),
                               entry->dst, entry->size_written))
         continue;

      /* If the type sizes don't match, each channel of the instruction is
       * either extracting a portion of the constant (which could be handled
       * with some effort, but the code below doesn't) or reading multiple
       * channels of the source at once.
       */
      if (type_sz(inst->src[i].type) != type_sz(entry->dst.type))
         continue;

      fs_reg val = entry->src;
      val.type = inst->src[i].type;

      if (inst->src[i].abs) {
         if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
             !brw_abs_immediate(val.type, &val.as_brw_reg())) {
            continue;
         }
      }

      if (inst->src[i].negate) {
         if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
             !brw_negate_immediate(val.type, &val.as_brw_reg())) {
            continue;
         }
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
      case SHADER_OPCODE_LOAD_PAYLOAD:
      case FS_OPCODE_PACK:
         inst->src[i] = val;
         progress = true;
         break;

      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         /* FINISHME: Promote non-float constants and remove this. */
         if (devinfo->gen < 8)
            break;
         /* fallthrough */
      case SHADER_OPCODE_POW:
         /* Allow constant propagation into src1 (except on Gen 6 which
          * doesn't support scalar source math), and let constant combining
          * promote the constant on Gen < 8.
          */
         if (devinfo->gen == 6)
            break;
         /* fallthrough */
      case BRW_OPCODE_BFI1:
      case BRW_OPCODE_ASR:
      case BRW_OPCODE_SHL:
      case BRW_OPCODE_SHR:
      case BRW_OPCODE_SUBB:
         if (i == 1) {
            inst->src[i] = val;
            progress = true;
         }
         break;

      case BRW_OPCODE_MACH:
      case BRW_OPCODE_MUL:
      case SHADER_OPCODE_MULH:
      case BRW_OPCODE_ADD:
      case BRW_OPCODE_OR:
      case BRW_OPCODE_AND:
      case BRW_OPCODE_XOR:
      case BRW_OPCODE_ADDC:
         if (i == 1) {
            inst->src[i] = val;
            progress = true;
         } else if (i == 0 && inst->src[1].file != IMM) {
            /* Fit this constant in by commuting the operands.
             * Exception: we can't do this for 32-bit integer MUL/MACH
             * because it's asymmetric.
             *
             * The BSpec says for Broadwell that
             *
             *    "When multiplying DW x DW, the dst cannot be accumulator."
             *
             * Integer MUL with a non-accumulator destination will be lowered
             * by lower_integer_multiplication(), so don't restrict it.
             */
            if (((inst->opcode == BRW_OPCODE_MUL &&
                  inst->dst.is_accumulator()) ||
                 inst->opcode == BRW_OPCODE_MACH) &&
                (inst->src[1].type == BRW_REGISTER_TYPE_D ||
                 inst->src[1].type == BRW_REGISTER_TYPE_UD))
               break;
            inst->src[0] = inst->src[1];
            inst->src[1] = val;
            progress = true;
         }
         break;

      case BRW_OPCODE_CMP:
      case BRW_OPCODE_IF:
         if (i == 1) {
            inst->src[i] = val;
            progress = true;
         } else if (i == 0 && inst->src[1].file != IMM) {
            enum brw_conditional_mod new_cmod;

            new_cmod = brw_swap_cmod(inst->conditional_mod);
            if (new_cmod != BRW_CONDITIONAL_NONE) {
               /* Fit this constant in by swapping the operands and
                * flipping the test
                */
               inst->src[0] = inst->src[1];
               inst->src[1] = val;
               inst->conditional_mod = new_cmod;
               progress = true;
            }
         }
         break;

      case BRW_OPCODE_SEL:
         if (i == 1) {
            inst->src[i] = val;
            progress = true;
         } else if (i == 0 && inst->src[1].file != IMM) {
            inst->src[0] = inst->src[1];
            inst->src[1] = val;

            /* If this was predicated, flipping operands means
             * we also need to flip the predicate.
             */
            if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
               inst->predicate_inverse =
                  !inst->predicate_inverse;
            }
            progress = true;
         }
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC:
      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
      case SHADER_OPCODE_TYPED_ATOMIC:
      case SHADER_OPCODE_TYPED_SURFACE_READ:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         /* We only propagate into the surface argument of the
          * instruction. Everything else goes through LOAD_PAYLOAD.
          */
         if (i == 1) {
            inst->src[i] = val;
            progress = true;
         }
         break;

      case FS_OPCODE_FB_WRITE_LOGICAL:
         /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
          * bit-cast using a strided region so they cannot be immediates.
          */
         if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
             i != FB_WRITE_LOGICAL_SRC_OMASK) {
            inst->src[i] = val;
            progress = true;
         }
         break;

      case SHADER_OPCODE_TEX_LOGICAL:
      case SHADER_OPCODE_TXD_LOGICAL:
      case SHADER_OPCODE_TXF_LOGICAL:
      case SHADER_OPCODE_TXL_LOGICAL:
      case SHADER_OPCODE_TXS_LOGICAL:
      case FS_OPCODE_TXB_LOGICAL:
      case SHADER_OPCODE_TXF_CMS_LOGICAL:
      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
      case SHADER_OPCODE_TXF_UMS_LOGICAL:
      case SHADER_OPCODE_TXF_MCS_LOGICAL:
      case SHADER_OPCODE_LOD_LOGICAL:
      case SHADER_OPCODE_TG4_LOGICAL:
      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
         inst->src[i] = val;
         progress = true;
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      case SHADER_OPCODE_BROADCAST:
         inst->src[i] = val;
         progress = true;
         break;

      case BRW_OPCODE_MAD:
      case BRW_OPCODE_LRP:
         inst->src[i] = val;
         progress = true;
         break;

      default:
         break;
      }
   }

   return progress;
}

static bool
can_propagate_from(fs_inst *inst)
{
   return (inst->opcode == BRW_OPCODE_MOV &&
           inst->dst.file == VGRF &&
           ((inst->src[0].file == VGRF &&
             !regions_overlap(inst->dst, inst->size_written,
                              inst->src[0], inst->size_read(0))) ||
            inst->src[0].file == ATTR ||
            inst->src[0].file == UNIFORM ||
            inst->src[0].file == IMM) &&
           inst->src[0].type == inst->dst.type &&
           !inst->is_partial_write());
}

/* Walks a basic block and does copy propagation on it using the acp
 * list.
 */
bool
fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
                                       exec_list *acp)
{
   bool progress = false;

   foreach_inst_in_block(fs_inst, inst, block) {
      /* Try propagating into this instruction. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != VGRF)
            continue;

         foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) {
            if (try_constant_propagate(inst, entry))
               progress = true;
            else if (try_copy_propagate(inst, i, entry))
               progress = true;
         }
      }

      /* kill the destination from the ACP */
      if (inst->dst.file == VGRF) {
         foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
            if (regions_overlap(entry->dst, entry->size_written,
                                inst->dst, inst->size_written))
               entry->remove();
         }

         /* Oops, we only have the chaining hash based on the destination, not
          * the source, so walk across the entire table.
          */
         for (int i = 0; i < ACP_HASH_SIZE; i++) {
            foreach_in_list_safe(acp_entry, entry, &acp[i]) {
               /* Make sure we kill the entry if this instruction overwrites
                * _any_ of the registers that it reads
                */
               if (regions_overlap(entry->src, entry->size_read,
                                   inst->dst, inst->size_written))
                  entry->remove();
            }
         }
      }

      /* If this instruction's source could potentially be folded into the
       * operand of another instruction, add it to the ACP.
       */
      if (can_propagate_from(inst)) {
         acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
         entry->dst = inst->dst;
         entry->src = inst->src[0];
         entry->size_written = inst->size_written;
         entry->size_read = inst->size_read(0);
         entry->opcode = inst->opcode;
         entry->saturate = inst->saturate;
         acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
      } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
                 inst->dst.file == VGRF) {
         int offset = 0;
         for (int i = 0; i < inst->sources; i++) {
            int effective_width = i < inst->header_size ? 8 : inst->exec_size;
            assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
            const unsigned size_written = effective_width *
                                          type_sz(inst->src[i].type);
            if (inst->src[i].file == VGRF) {
               acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
               entry->dst = byte_offset(inst->dst, offset);
               entry->src = inst->src[i];
               entry->size_written = size_written;
               entry->size_read = inst->size_read(i);
               entry->opcode = inst->opcode;
               if (!entry->dst.equals(inst->src[i])) {
                  acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
               } else {
                  ralloc_free(entry);
               }
            }
            offset += size_written;
         }
      }
   }

   return progress;
}

bool
fs_visitor::opt_copy_propagation()
{
   bool progress = false;
   void *copy_prop_ctx = ralloc_context(NULL);
   exec_list *out_acp[cfg->num_blocks];

   for (int i = 0; i < cfg->num_blocks; i++)
      out_acp[i] = new exec_list [ACP_HASH_SIZE];

   /* First, walk through each block doing local copy propagation and getting
    * the set of copies available at the end of the block.
    */
   foreach_block (block, cfg) {
      progress = opt_copy_propagation_local(copy_prop_ctx, block,
                                            out_acp[block->num]) || progress;
   }

   /* Do dataflow analysis for those available copies. */
   fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, out_acp);

   /* Next, re-run local copy propagation, this time with the set of copies
    * provided by the dataflow analysis available at the start of a block.
    */
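   /* Each ACP entry whose bit is set in a block's livein mask is pulled out
    * of dataflow.acp and re-bucketed into a local in_acp table, hashed by
    * destination register number just as in the first pass, before the block
    * is walked again.
    */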
846 */ 847 foreach_block (block, cfg) { 848 exec_list in_acp[ACP_HASH_SIZE]; 849 850 for (int i = 0; i < dataflow.num_acp; i++) { 851 if (BITSET_TEST(dataflow.bd[block->num].livein, i)) { 852 struct acp_entry *entry = dataflow.acp[i]; 853 in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); 854 } 855 } 856 857 progress = opt_copy_propagation_local(copy_prop_ctx, block, in_acp) || 858 progress; 859 } 860 861 for (int i = 0; i < cfg->num_blocks; i++) 862 delete [] out_acp[i]; 863 ralloc_free(copy_prop_ctx); 864 865 if (progress) 866 invalidate_live_intervals(); 867 868 return progress; 869 } 870