/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark (at) freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark (at) freedesktop.org>
 */

#include "freedreno_util.h"

#include "ir3.h"
#include "ir3_shader.h"

/*
 * Copy Propagate:
 */

/* Per-pass state threaded through the copy-propagation walk. */
struct ir3_cp_ctx {
	struct ir3 *shader;              /* the shader IR being optimized */
	struct ir3_shader_variant *so;   /* variant; receives immediates lowered to consts */
	unsigned immediate_idx;          /* number of slots used so far in so->immediates[].val[] */
};

/* is it a type preserving mov, with ok flags?
 */
static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
{
	if (is_same_type_mov(instr)) {
		struct ir3_register *dst = instr->regs[0];
		struct ir3_register *src = instr->regs[1];
		struct ir3_instruction *src_instr = ssa(src);

		/* only if mov src is SSA (not const/immed): */
		if (!src_instr)
			return false;

		/* no indirect: */
		if (dst->flags & IR3_REG_RELATIV)
			return false;
		if (src->flags & IR3_REG_RELATIV)
			return false;

		/* with allow_flags==false, reject any abs/neg/not modifier on
		 * the mov src (caller cannot absorb the modifier):
		 */
		if (!allow_flags)
			if (src->flags & (IR3_REG_FABS | IR3_REG_FNEG |
					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
				return false;

		/* TODO: remove this hack: */
		if (src_instr->opc == OPC_META_FO)
			return false;
		/* TODO: we currently don't handle left/right neighbors
		 * very well when inserting parallel-copies into phi..
		 * to avoid problems don't eliminate a mov coming out
		 * of phi..
		 */
		if (src_instr->opc == OPC_META_PHI)
			return false;
		return true;
	}
	return false;
}

/* Mask down to only the flags copy-propagation cares about. */
static unsigned cp_flags(unsigned flags)
{
	/* only considering these flags (at least for now): */
	flags &= (IR3_REG_CONST | IR3_REG_IMMED |
			IR3_REG_FNEG | IR3_REG_FABS |
			IR3_REG_SNEG | IR3_REG_SABS |
			IR3_REG_BNOT | IR3_REG_RELATIV);
	return flags;
}

/* Can src 'n' of 'instr' legally carry the (combined) register flags
 * 'flags'?  Checks the per-instruction-category encoding restrictions
 * (const/immed/relative/abs/neg availability) for this src slot.
 */
static bool valid_flags(struct ir3_instruction *instr, unsigned n,
		unsigned flags)
{
	unsigned valid_flags;
	flags = cp_flags(flags);

	/* If destination is indirect, then source cannot be.. at least
	 * I don't think so..
	 */
	if ((instr->regs[0]->flags & IR3_REG_RELATIV) &&
			(flags & IR3_REG_RELATIV))
		return false;

	/* TODO it seems to *mostly* work to cp RELATIV, except we get some
	 * intermittent piglit variable-indexing fails.  Newer blob driver
	 * doesn't seem to cp these.  Possibly this is hw workaround?  Not
	 * sure, but until that is understood better, lets just switch off
	 * cp for indirect src's:
	 */
	if (flags & IR3_REG_RELATIV)
		return false;

	switch (opc_cat(instr->opc)) {
	case 1:
		/* mov: any src type is encodable */
		valid_flags = IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV;
		if (flags & ~valid_flags)
			return false;
		break;
	case 2:
		valid_flags = ir3_cat2_absneg(instr->opc) |
				IR3_REG_CONST | IR3_REG_RELATIV;

		/* only integer cat2 opcodes can encode an immediate src: */
		if (ir3_cat2_int(instr->opc))
			valid_flags |= IR3_REG_IMMED;

		if (flags & ~valid_flags)
			return false;

		if (flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
			/* m indexes the *other* src slot (n^1), +1 to skip dst: */
			unsigned m = (n ^ 1) + 1;
			/* cannot deal w/ const in both srcs:
			 * (note that some cat2 actually only have a single src)
			 */
			if (m < instr->regs_count) {
				struct ir3_register *reg = instr->regs[m];
				if ((flags & IR3_REG_CONST) && (reg->flags & IR3_REG_CONST))
					return false;
				if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
					return false;
			}
			/* cannot be const + ABS|NEG: */
			if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
				return false;
		}
		break;
	case 3:
		valid_flags = ir3_cat3_absneg(instr->opc) |
				IR3_REG_CONST | IR3_REG_RELATIV;

		if (flags & ~valid_flags)
			return false;

		if (flags & (IR3_REG_CONST | IR3_REG_RELATIV)) {
			/* cannot deal w/ const/relativ in 2nd src: */
			if (n == 1)
				return false;
		}

		if (flags & IR3_REG_CONST) {
			/* cannot be const + ABS|NEG: */
			if (flags & (IR3_REG_FABS | IR3_REG_FNEG |
					IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT))
				return false;
		}
		break;
	case 4:
		/* seems like blob compiler avoids const as src..  */
		/* TODO double check if this is still the case on a4xx */
		if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
			return false;
		if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
			return false;
		break;
	case 5:
		/* no flags allowed */
		if (flags)
			return false;
		break;
	case 6:
		valid_flags = IR3_REG_IMMED;
		if (flags & ~valid_flags)
			return false;

		if (flags & IR3_REG_IMMED) {
			/* doesn't seem like we can have immediate src for store
			 * instructions:
			 *
			 * TODO this restriction could also apply to load instructions,
			 * but for load instructions this arg is the address (and not
			 * really sure any good way to test a hard-coded immed addr src)
			 */
			if (is_store(instr) && (n == 1))
				return false;

			if ((instr->opc == OPC_LDL) && (n != 1))
				return false;

			if ((instr->opc == OPC_STL) && (n != 2))
				return false;

			/* disallow CP into anything but the SSBO slot argument for
			 * atomics:
			 */
			if (is_atomic(instr->opc) && (n != 0))
				return false;

			if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
				return false;
		}

		break;
	}

	return true;
}

/* propagate register flags from src to dst.. negates need special
 * handling to cancel each other out.
221 */ 222 static void combine_flags(unsigned *dstflags, struct ir3_instruction *src) 223 { 224 unsigned srcflags = src->regs[1]->flags; 225 226 /* if what we are combining into already has (abs) flags, 227 * we can drop (neg) from src: 228 */ 229 if (*dstflags & IR3_REG_FABS) 230 srcflags &= ~IR3_REG_FNEG; 231 if (*dstflags & IR3_REG_SABS) 232 srcflags &= ~IR3_REG_SNEG; 233 234 if (srcflags & IR3_REG_FABS) 235 *dstflags |= IR3_REG_FABS; 236 if (srcflags & IR3_REG_SABS) 237 *dstflags |= IR3_REG_SABS; 238 if (srcflags & IR3_REG_FNEG) 239 *dstflags ^= IR3_REG_FNEG; 240 if (srcflags & IR3_REG_SNEG) 241 *dstflags ^= IR3_REG_SNEG; 242 if (srcflags & IR3_REG_BNOT) 243 *dstflags ^= IR3_REG_BNOT; 244 245 *dstflags &= ~IR3_REG_SSA; 246 *dstflags |= srcflags & IR3_REG_SSA; 247 *dstflags |= srcflags & IR3_REG_CONST; 248 *dstflags |= srcflags & IR3_REG_IMMED; 249 *dstflags |= srcflags & IR3_REG_RELATIV; 250 *dstflags |= srcflags & IR3_REG_ARRAY; 251 252 /* if src of the src is boolean we can drop the (abs) since we know 253 * the source value is already a postitive integer. This cleans 254 * up the absnegs that get inserted when converting between nir and 255 * native boolean (see ir3_b2n/n2b) 256 */ 257 struct ir3_instruction *srcsrc = ssa(src->regs[1]); 258 if (srcsrc && is_bool(srcsrc)) 259 *dstflags &= ~IR3_REG_SABS; 260 } 261 262 static struct ir3_register * 263 lower_immed(struct ir3_cp_ctx *ctx, struct ir3_register *reg, unsigned new_flags) 264 { 265 unsigned swiz, idx, i; 266 267 reg = ir3_reg_clone(ctx->shader, reg); 268 269 /* in some cases, there are restrictions on (abs)/(neg) plus const.. 
270 * so just evaluate those and clear the flags: 271 */ 272 if (new_flags & IR3_REG_SABS) { 273 reg->iim_val = abs(reg->iim_val); 274 new_flags &= ~IR3_REG_SABS; 275 } 276 277 if (new_flags & IR3_REG_FABS) { 278 reg->fim_val = fabs(reg->fim_val); 279 new_flags &= ~IR3_REG_FABS; 280 } 281 282 if (new_flags & IR3_REG_SNEG) { 283 reg->iim_val = -reg->iim_val; 284 new_flags &= ~IR3_REG_SNEG; 285 } 286 287 if (new_flags & IR3_REG_FNEG) { 288 reg->fim_val = -reg->fim_val; 289 new_flags &= ~IR3_REG_FNEG; 290 } 291 292 for (i = 0; i < ctx->immediate_idx; i++) { 293 swiz = i % 4; 294 idx = i / 4; 295 296 if (ctx->so->immediates[idx].val[swiz] == reg->uim_val) { 297 break; 298 } 299 } 300 301 if (i == ctx->immediate_idx) { 302 /* need to generate a new immediate: */ 303 swiz = i % 4; 304 idx = i / 4; 305 ctx->so->immediates[idx].val[swiz] = reg->uim_val; 306 ctx->so->immediates_count = idx + 1; 307 ctx->immediate_idx++; 308 } 309 310 new_flags &= ~IR3_REG_IMMED; 311 new_flags |= IR3_REG_CONST; 312 reg->flags = new_flags; 313 reg->num = i + (4 * ctx->so->constbase.immediate); 314 315 return reg; 316 } 317 318 /** 319 * Handle cp for a given src register. This additionally handles 320 * the cases of collapsing immedate/const (which replace the src 321 * register with a non-ssa src) or collapsing mov's from relative 322 * src (which needs to also fixup the address src reference by the 323 * instruction). 
 */
static void
reg_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr,
		struct ir3_register *reg, unsigned n)
{
	struct ir3_instruction *src = ssa(reg);

	/* don't propagate copies into a PHI, since we don't know if the
	 * src block executed:
	 */
	if (instr->opc == OPC_META_PHI)
		return;

	if (is_eligible_mov(src, true)) {
		/* simple case, no immed/const/relativ, only mov's w/ ssa src: */
		struct ir3_register *src_reg = src->regs[1];
		unsigned new_flags = reg->flags;

		combine_flags(&new_flags, src);

		/* point 'reg' directly at the mov's src, if the consuming
		 * instruction can encode the combined flags:
		 */
		if (valid_flags(instr, n, new_flags)) {
			if (new_flags & IR3_REG_ARRAY) {
				debug_assert(!(reg->flags & IR3_REG_ARRAY));
				reg->array = src_reg->array;
			}
			reg->flags = new_flags;
			reg->instr = ssa(src_reg);
		}

		src = ssa(reg);      /* could be null for IR3_REG_ARRAY case */
		if (!src)
			return;
	} else if (is_same_type_mov(src) &&
			/* cannot collapse const/immed/etc into meta instrs: */
			!is_meta(instr)) {
		/* immed/const/etc cases, which require some special handling: */
		struct ir3_register *src_reg = src->regs[1];
		unsigned new_flags = reg->flags;

		combine_flags(&new_flags, src);

		if (!valid_flags(instr, n, new_flags)) {
			/* See if lowering an immediate to const would help. */
			if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
				debug_assert(new_flags & IR3_REG_IMMED);
				instr->regs[n + 1] = lower_immed(ctx, src_reg, new_flags);
				return;
			}

			/* special case for "normal" mad instructions, we can
			 * try swapping the first two args if that fits better.
			 *
			 * the "plain" MAD's (ie. the ones that don't shift first
			 * src prior to multiply) can swap their first two srcs if
			 * src[0] is !CONST and src[1] is CONST:
			 */
			if ((n == 1) && is_mad(instr->opc) &&
					!(instr->regs[0 + 1]->flags & (IR3_REG_CONST | IR3_REG_RELATIV)) &&
					valid_flags(instr, 0, new_flags)) {
				/* swap src[0] and src[1]: */
				struct ir3_register *tmp;
				tmp = instr->regs[0 + 1];
				instr->regs[0 + 1] = instr->regs[1 + 1];
				instr->regs[1 + 1] = tmp;
				/* continue below, now operating on src slot 0: */
				n = 0;
			} else {
				return;
			}
		}

		/* Here we handle the special case of mov from
		 * CONST and/or RELATIV.  These need to be handled
		 * specially, because in the case of move from CONST
		 * there is no src ir3_instruction so we need to
		 * replace the ir3_register.  And in the case of
		 * RELATIV we need to handle the address register
		 * dependency.
		 */
		if (src_reg->flags & IR3_REG_CONST) {
			/* an instruction cannot reference two different
			 * address registers:
			 */
			if ((src_reg->flags & IR3_REG_RELATIV) &&
					conflicts(instr->address, reg->instr->address))
				return;

			/* This seems to be a hw bug, or something where the timings
			 * just somehow don't work out.  This restriction may only
			 * apply if the first src is also CONST.
			 */
			if ((opc_cat(instr->opc) == 3) && (n == 2) &&
					(src_reg->flags & IR3_REG_RELATIV) &&
					(src_reg->array.offset == 0))
				return;

			/* clone the mov's src register onto the consumer: */
			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
			src_reg->flags = new_flags;
			instr->regs[n+1] = src_reg;

			if (src_reg->flags & IR3_REG_RELATIV)
				ir3_instr_set_address(instr, reg->instr->address);

			return;
		}

		if ((src_reg->flags & IR3_REG_RELATIV) &&
				!conflicts(instr->address, reg->instr->address)) {
			src_reg = ir3_reg_clone(instr->block->shader, src_reg);
			src_reg->flags = new_flags;
			instr->regs[n+1] = src_reg;
			ir3_instr_set_address(instr, reg->instr->address);

			return;
		}

		/* NOTE: seems we can only do immed integers, so don't
		 * need to care about float.  But we do need to handle
		 * abs/neg *before* checking that the immediate requires
		 * few enough bits to encode:
		 *
		 * TODO: do we need to do something to avoid accidentally
		 * catching a float immed?
		 */
		if (src_reg->flags & IR3_REG_IMMED) {
			int32_t iim_val = src_reg->iim_val;

			debug_assert((opc_cat(instr->opc) == 1) ||
					(opc_cat(instr->opc) == 6) ||
					ir3_cat2_int(instr->opc));

			/* fold the modifiers into the immediate value: */
			if (new_flags & IR3_REG_SABS)
				iim_val = abs(iim_val);

			if (new_flags & IR3_REG_SNEG)
				iim_val = -iim_val;

			if (new_flags & IR3_REG_BNOT)
				iim_val = ~iim_val;

			/* other than category 1 (mov) we can only encode up to 10 bits: */
			if ((instr->opc == OPC_MOV) ||
					!((iim_val & ~0x3ff) && (-iim_val & ~0x3ff))) {
				new_flags &= ~(IR3_REG_SABS | IR3_REG_SNEG | IR3_REG_BNOT);
				src_reg = ir3_reg_clone(instr->block->shader, src_reg);
				src_reg->flags = new_flags;
				src_reg->iim_val = iim_val;
				instr->regs[n+1] = src_reg;
			} else if (valid_flags(instr, n, (new_flags & ~IR3_REG_IMMED) | IR3_REG_CONST)) {
				/* See if lowering an immediate to const would help. */
				instr->regs[n+1] = lower_immed(ctx, src_reg, new_flags);
			}

			return;
		}
	}
}

/* Handle special case of eliminating output mov, and similar cases where
 * there isn't a normal "consuming" instruction.  In this case we cannot
 * collapse flags (ie. output mov from const, or w/ abs/neg flags, cannot
 * be eliminated)
 */
static struct ir3_instruction *
eliminate_output_mov(struct ir3_instruction *instr)
{
	if (is_eligible_mov(instr, false)) {
		struct ir3_register *reg = instr->regs[1];
		if (!(reg->flags & IR3_REG_ARRAY)) {
			struct ir3_instruction *src_instr = ssa(reg);
			debug_assert(src_instr);
			return src_instr;
		}
	}
	return instr;
}

/**
 * Find instruction src's which are mov's that can be collapsed, replacing
 * the mov dst with the mov src
 */
static void
instr_cp(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr)
{
	struct ir3_register *reg;

	if (instr->regs_count == 0)
		return;

	/* only visit each instruction once (the IR is a DAG of uses): */
	if (ir3_instr_check_mark(instr))
		return;

	/* walk down the graph from each src: */
	foreach_src_n(reg, n, instr) {
		struct ir3_instruction *src = ssa(reg);

		if (!src)
			continue;

		/* depth-first: propagate into the src chain before this use: */
		instr_cp(ctx, src);

		/* TODO non-indirect access we could figure out which register
		 * we actually want and allow cp..
		 */
		if (reg->flags & IR3_REG_ARRAY)
			continue;

		reg_cp(ctx, instr, reg, n);
	}

	/* array-dst instructions have an SSA dependency via regs[0]: */
	if (instr->regs[0]->flags & IR3_REG_ARRAY) {
		struct ir3_instruction *src = ssa(instr->regs[0]);
		if (src)
			instr_cp(ctx, src);
	}

	/* also walk/collapse the address-register dependency: */
	if (instr->address) {
		instr_cp(ctx, instr->address);
		ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
	}

	/* we can end up with extra cmps.s from frontend, which uses a
	 *
	 *    cmps.s p0.x, cond, 0
	 *
	 * as a way to mov into the predicate register.  But frequently 'cond'
	 * is itself a cmps.s/cmps.f/cmps.u.  So detect this special case and
	 * just re-write the instruction writing predicate register to get rid
	 * of the double cmps.
	 */
	if ((instr->opc == OPC_CMPS_S) &&
			(instr->regs[0]->num == regid(REG_P0, 0)) &&
			ssa(instr->regs[1]) &&
			(instr->regs[2]->flags & IR3_REG_IMMED) &&
			(instr->regs[2]->iim_val == 0)) {
		struct ir3_instruction *cond = ssa(instr->regs[1]);
		switch (cond->opc) {
		case OPC_CMPS_S:
		case OPC_CMPS_F:
		case OPC_CMPS_U:
			/* steal the compare's opcode and srcs, keeping our p0 dst: */
			instr->opc = cond->opc;
			instr->flags = cond->flags;
			instr->cat2 = cond->cat2;
			instr->address = cond->address;
			instr->regs[1] = cond->regs[1];
			instr->regs[2] = cond->regs[2];
			break;
		default:
			break;
		}
	}
}

/* Entry point: run copy-propagation over all roots of the shader's IR
 * (outputs, block conditions, and "keep" instructions), eliminating
 * eligible movs at the roots themselves as well.
 */
void
ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so)
{
	struct ir3_cp_ctx ctx = {
			.shader = ir,
			.so = so,
	};

	ir3_clear_mark(ir);

	for (unsigned i = 0; i < ir->noutputs; i++) {
		if (ir->outputs[i]) {
			instr_cp(&ctx, ir->outputs[i]);
			ir->outputs[i] = eliminate_output_mov(ir->outputs[i]);
		}
	}

	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		if (block->condition) {
			instr_cp(&ctx, block->condition);
			block->condition = eliminate_output_mov(block->condition);
		}

		for (unsigned i = 0; i < block->keeps_count; i++) {
			instr_cp(&ctx, block->keeps[i]);
			block->keeps[i] = eliminate_output_mov(block->keeps[i]);
		}
	}
}