/* -*- c++ -*- */
/*
 * Copyright 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_VEC4_BUILDER_H
#define BRW_VEC4_BUILDER_H

#include "brw_ir_vec4.h"
#include "brw_ir_allocator.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble a VEC4 IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::fs_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    *
    * A builder is a small value object: the combinators below (at(),
    * at_end(), group(), exec_all(), annotate()) each return a modified
    * copy rather than mutating *this, so builders can be freely derived
    * from one another.
    */
   class vec4_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef brw::src_reg src_reg;

      /** Type used in this IR to represent the destination of an
       *  instruction. */
      typedef brw::dst_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef vec4_instruction instruction;

      /**
       * Construct a vec4_builder that inserts instructions into \p shader.
       *
       * With no block/cursor set, emitted instructions are appended
       * relative to whatever cursor a derived builder later provides.
       */
      vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width), _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct a vec4_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size), _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct a vec4_builder that inserts instructions before \p cursor
       * in basic block \p block, inheriting other code generation parameters
       * from this.
       */
      vec4_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         vec4_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct a vec4_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       *
       * Inserting before the list's tail sentinel is equivalent to
       * appending; the cast is needed because the sentinel is an
       * exec_node embedded in the exec_list.
       */
      vec4_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      vec4_builder
      group(unsigned n, unsigned i) const
      {
         /* The requested group must fit inside the current dispatch width,
          * unless channel masking is disabled entirely.
          */
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         vec4_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      vec4_builder
      exec_all(bool b = true) const
      {
         vec4_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      vec4_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         vec4_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (four for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for four logical
       * components in this IR).
       *
       * Types wider than 4 bytes get proportionally more registers; a
       * request for zero registers yields a typed null register instead.
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return retype(dst_reg(VGRF, shader->alloc.allocate(
                                     n * DIV_ROUND_UP(type_sz(type), 4))),
                          type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       *
       * The instruction is cloned into the shader's memory context so the
       * caller's copy may live on the stack.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       *
       * Math opcodes are routed through the math-specific operand and
       * instruction fixups required on some hardware generations.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       *
       * Two-source math opcodes get the same fixups as the unary ones.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       *
       * Three-source instructions have a restricted encoding, so their
       * operands may need to be expanded first (see fix_3src_operand()).
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dst, src0, src1, src2));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       *
       * This is the single funnel point all other emit() overloads go
       * through: it stamps the builder's execution controls and debug
       * annotation onto the instruction, then inserts it at the cursor.
       */
      instruction *
      emit(instruction *inst) const
      {
         inst->exec_size = dispatch_width();
         inst->group = group();
         inst->force_writemask_all = force_writemask_all;
         inst->size_written = inst->exec_size * type_sz(inst->dst.type);
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         /* With a basic block we must keep the block's bookkeeping in
          * sync; without one (e.g. at_end()) a plain list insertion
          * before the cursor node suffices.
          */
         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       *
       * Both instructions run with writemasking disabled so the result is
       * valid regardless of which channels are currently enabled.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         const vec4_builder ubld = exec_all();
         const dst_reg chan_index =
            writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));

         return src_reg(dst);
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

/* Like ALU2, but for opcodes that implicitly write the accumulator. */
#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Gen6 IF with embedded comparison.
       */
      instruction *
      IF(const src_reg &src0, const src_reg &src1,
         brw_conditional_mod condition) const
      {
         assert(shader->devinfo->gen == 6);
         return set_condmod(condition,
                            emit(BRW_OPCODE_IF,
                                 null_reg_d(),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /** Shader the builder emits into; public for convenience. */
      backend_shader *shader;

   protected:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for the details.
       *
       * The negated value is materialized through a temporary MOV so the
       * consuming instruction never sees a negated UD source directly.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for register access modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
          * able to use vertical stride of zero to replicate the vec4 uniform, like
          *
          *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
          *
          * But you can't, since vertical stride is always four in three-source
          * instructions. Instead, insert a MOV instruction to do the replication so
          * that the three-source instruction can consume it.
          */

         /* The MOV is only needed if the source is a uniform or immediate. */
         if (src.file != UNIFORM && src.file != IMM)
            return src;

         /* Single-value swizzles of a uniform already replicate one
          * component, so no expansion is required.
          */
         if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
            return src;

         const dst_reg expanded = vgrf(src.type);
         emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
         return src_reg(expanded);
      }

      /**
       * Workaround for register access modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* The gen6 math instruction ignores the source modifiers --
          * swizzle, abs, negate, and at least some parts of the register
          * region description.
          *
          * Rather than trying to enumerate all these cases, *always* expand the
          * operand to a temp GRF for gen6.
          *
          * For gen7, keep the operand as-is, except if immediate, which gen7 still
          * can't use.
          */
         if (shader->devinfo->gen == 6 ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return src_reg(tmp);
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       *
       * Called after \p inst has already been inserted at the cursor, so
       * any instruction emitted here lands after it in program order.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen == 6 &&
             inst->dst.writemask != WRITEMASK_XYZW) {
            /* Gen6 math writes the full XYZW; retarget it to a fresh
             * temporary and apply the original writemask with a
             * trailing MOV into the intended destination.
             */
            const dst_reg tmp = vgrf(inst->dst.type);
            MOV(inst->dst, src_reg(tmp));
            inst->dst = tmp;

         } else if (shader->devinfo->gen < 6) {
            /* Pre-gen6 math is a message to a shared unit, so set up the
             * MRF message length from the number of present sources.
             */
            const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
            inst->base_mrf = 1;
            inst->mlen = sources;
         }

         return inst;
      }

      /** Basic block instructions are inserted into, or NULL. */
      bblock_t *block;
      /** Insertion point; new instructions go before this node. */
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif