/* -*- c++ -*- */
/*
 * Copyright 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }
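
      /* As a usage sketch (hypothetical names, not code from this file):
       * an optimization pass that wants to rewrite an instruction in
       * place can wrap it in a builder that inherits its execution
       * controls, emit the replacement just before it, then delete the
       * original:
       *
       *    const fs_builder ibld(this, block, inst);
       *    ibld.MOV(inst->dst, brw_imm_f(0.0f));
       *    inst->remove(block);
       */
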
      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }
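
      /* For illustration, a hedged sketch of how group() and vgrf()
       * combine (assuming a SIMD16 builder "bld" and the half() helper
       * from brw_ir_fs.h): split a SIMD16 move into two SIMD8 halves,
       * each writing its own channel group of a freshly allocated
       * register:
       *
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    for (unsigned i = 0; i < 2; i++)
       *       bld.group(8, i).MOV(half(tmp, i), half(src, i));
       */
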
      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         assert(shader->stage != MESA_SHADER_FRAGMENT ||
                group() + dispatch_width() <= 16);
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }
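
      /* A small usage sketch of the overloads above (assuming a builder
       * "bld" and registers "dst" and "src"): math opcodes are routed
       * through fix_math_operand(), so callers need not worry about the
       * gen6/gen7 operand restrictions themselves:
       *
       *    bld.emit(SHADER_OPCODE_RCP, dst, src);
       *    bld.emit(SHADER_OPCODE_POW, dst, src, brw_imm_f(2.0f));
       */
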
      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in SIMD16 or SIMD32
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
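
      /* Hypothetical usage sketch: callers typically uniformize a
       * possibly divergent surface or sampler index before using it in a
       * send message, e.g.:
       *
       *    const src_reg usurface = bld.emit_uniformize(surface_index);
       */
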
418 * @{ 419 */ 420 #define ALU1(op) \ 421 instruction * \ 422 op(const dst_reg &dst, const src_reg &src0) const \ 423 { \ 424 return emit(BRW_OPCODE_##op, dst, src0); \ 425 } 426 427 #define ALU2(op) \ 428 instruction * \ 429 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ 430 { \ 431 return emit(BRW_OPCODE_##op, dst, src0, src1); \ 432 } 433 434 #define ALU2_ACC(op) \ 435 instruction * \ 436 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ 437 { \ 438 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ 439 inst->writes_accumulator = true; \ 440 return inst; \ 441 } 442 443 #define ALU3(op) \ 444 instruction * \ 445 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ 446 const src_reg &src2) const \ 447 { \ 448 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ 449 } 450 451 ALU2(ADD) 452 ALU2_ACC(ADDC) 453 ALU2(AND) 454 ALU2(ASR) 455 ALU2(AVG) 456 ALU3(BFE) 457 ALU2(BFI1) 458 ALU3(BFI2) 459 ALU1(BFREV) 460 ALU1(CBIT) 461 ALU2(CMPN) 462 ALU3(CSEL) 463 ALU1(DIM) 464 ALU2(DP2) 465 ALU2(DP3) 466 ALU2(DP4) 467 ALU2(DPH) 468 ALU1(F16TO32) 469 ALU1(F32TO16) 470 ALU1(FBH) 471 ALU1(FBL) 472 ALU1(FRC) 473 ALU2(LINE) 474 ALU1(LZD) 475 ALU2(MAC) 476 ALU2_ACC(MACH) 477 ALU3(MAD) 478 ALU1(MOV) 479 ALU2(MUL) 480 ALU1(NOT) 481 ALU2(OR) 482 ALU2(PLN) 483 ALU1(RNDD) 484 ALU1(RNDE) 485 ALU1(RNDU) 486 ALU1(RNDZ) 487 ALU2(SAD2) 488 ALU2_ACC(SADA2) 489 ALU2(SEL) 490 ALU2(SHL) 491 ALU2(SHR) 492 ALU2_ACC(SUBB) 493 ALU2(XOR) 494 495 #undef ALU3 496 #undef ALU2_ACC 497 #undef ALU2 498 #undef ALU1 499 /** @} */ 500 501 /** 502 * CMP: Sets the low bit of the destination channels with the result 503 * of the comparison, while the upper bits are undefined, and updates 504 * the flag register with the packed 16 bits of the result. 505 */ 506 instruction * 507 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 508 brw_conditional_mod condition) const 509 { 510 /* Take the instruction: 511 * 512 * CMP null<d> src0<f> src1<f> 513 * 514 * Original gen4 does type conversion to the destination type 515 * before comparison, producing garbage results for floating 516 * point comparisons. 517 * 518 * The destination type doesn't matter on newer generations, 519 * so we set the type to match src0 so we can compact the 520 * instruction. 521 */ 522 return set_condmod(condition, 523 emit(BRW_OPCODE_CMP, retype(dst, src0.type), 524 fix_unsigned_negate(src0), 525 fix_unsigned_negate(src1))); 526 } 527 528 /** 529 * Gen4 predicated IF. 530 */ 531 instruction * 532 IF(brw_predicate predicate) const 533 { 534 return set_predicate(predicate, emit(BRW_OPCODE_IF)); 535 } 536 537 /** 538 * Emit a linear interpolation instruction. 539 */ 540 instruction * 541 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, 542 const src_reg &a) const 543 { 544 if (shader->devinfo->gen >= 6) { 545 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so 546 * we need to reorder the operands. 547 */ 548 return emit(BRW_OPCODE_LRP, dst, a, y, x); 549 550 } else { 551 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. 
      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0),
             * so we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math instructions.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif