/*
 * Copyright 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "brw_program.h"

namespace brw {

vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->predicate = BRW_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->ir = NULL;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
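   /* Added note: an exec_size of 8 corresponds to SIMD4x2 execution, i.e.
    * two vertices times four vector components per instruction.
    */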
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

#define ALU1(op)                                                         \
   vec4_instruction *                                                    \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)             \
   {                                                                     \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0);  \
   }

#define ALU2(op)                                                         \
   vec4_instruction *                                                    \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,             \
                    const src_reg &src1)                                 \
   {                                                                     \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,         \
                                           src0, src1);                  \
   }

#define ALU2_ACC(op)                                                     \
   vec4_instruction *                                                    \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,             \
                    const src_reg &src1)                                 \
   {                                                                     \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(            \
         BRW_OPCODE_##op, dst, src0, src1);                              \
      inst->writes_accumulator = true;                                   \
      return inst;                                                       \
   }

#define ALU3(op)                                                         \
   vec4_instruction *                                                    \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,             \
                    const src_reg &src1, const src_reg &src2)            \
   {                                                                     \
      assert(devinfo->gen >= 6);                                         \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,         \
                                           src0, src1, src2);            \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gen6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(devinfo->gen == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    *    CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}

src_reg
vec4_visitor::resolve_source_modifiers(const src_reg &src)
{
   if (!src.abs && !src.negate)
      return src;

   dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
   resolved.type = src.type;
   emit(MOV(resolved, src));

   return src_reg(resolved);
}

src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
      return src;

   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gen6.
    *
    * For gen7, keep the operand as-is, except if immediate, which gen7 still
    * can't use.
    */
   if (devinfo->gen == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gen6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_type::vec4_type);
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->gen < 6) {
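      /* Added note: before Gen6 the math opcodes are sends whose operands go
       * through MRFs (presumably to the shared extended-math unit), which is
       * why a base MRF and message length are set up here.
       */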
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->gen < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride. We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests. However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely. If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->gen < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}

void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_B;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}

void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_type::vec4_type);
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_type::uvec4_type);
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

   dst_reg min(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_type::ivec4_type);
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
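      /* Worked example (added note): measured in vec4 slots (as_vec4 == true),
       * a dmat3 has three dual-slot dvec3 columns and so needs 3 * 2 = 6
       * slots; measured in dvec4 slots (as_vec4 == false) it needs only 3.
       */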
      if (type->is_matrix()) {
         const glsl_type *col_type = type->column_type();
         unsigned col_slots =
            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess. Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
type_size_vec4(const struct glsl_type *type)
{
   return type_size_xvec4(type, true);
}

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located in location "x" and location "x+1", not "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
 * remap_vs_attrs() will take into account both the location and whether the
 * type fits in one or two vec4 slots.
 */
extern "C" int
type_size_dvec4(const struct glsl_type *type)
{
   return type_size_xvec4(type, false);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type));

   if (type->is_array() || type->is_record()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type) * size);

   this->swizzle = BRW_SWIZZLE_NOOP;

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type));

   if (type->is_array() || type->is_record()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}

vec4_instruction *
vec4_visitor::emit_lrp(const dst_reg &dst,
                       const src_reg &x, const src_reg &y, const src_reg &a)
{
   if (devinfo->gen >= 6) {
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
                      fix_3src_operand(x)));
   } else {
      /* Earlier generations don't support three source operations, so we
       * need to emit x*(1-a) + y*a.
       */
      dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
      dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
      dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
      y_times_a.writemask = dst.writemask;
      one_minus_a.writemask = dst.writemask;
      x_times_one_minus_a.writemask = dst.writemask;

      emit(MUL(y_times_a, y, a));
      emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
      emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
      return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
   }
}

/**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL in which case the instruction will be appended
 * to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->gen >= 9) {
      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
      src_reg header(this, glsl_type::uvec4_type, 2);

      pull = new(mem_ctx)
         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                          dst_reg(header));

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
                                 offset_reg.type);
      pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           dst,
                                           surf_index,
                                           header);
      pull->mlen = 2;
      pull->header_size = 1;
   } else if (devinfo->gen >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_type::uint_type);
   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                              src.type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}

src_reg
vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
                             src_reg coordinate, src_reg surface)
{
   vec4_instruction *inst =
      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                    dst_reg(this, glsl_type::uvec4_type));
   inst->base_mrf = 2;
   inst->src[1] = surface;
   inst->src[2] = surface;

   int param_base;

   if (devinfo->gen >= 9) {
      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
      vec4_instruction *header_inst = new(mem_ctx)
         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                          dst_reg(MRF, inst->base_mrf));

      emit(header_inst);

      inst->mlen = 2;
      inst->header_size = 1;
      param_base = inst->base_mrf + 1;
   } else {
      inst->mlen = 1;
      param_base = inst->base_mrf;
   }

   /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
   int zero_mask = 0xf & ~coord_mask;

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
            coordinate));

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
            brw_imm_d(0)));

   emit(inst);
   return src_reg(inst->dst);
}

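/* Added descriptive note: sampler indices of 16 or more (or indices that are
 * not known at compile time) cannot be encoded in the 4-bit sampler field of
 * the sampler message descriptor and have to be supplied through the message
 * header instead; that path only exists on Haswell and Gen8+.
 */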
bool
vec4_visitor::is_high_sampler(src_reg sampler)
{
   if (devinfo->gen < 8 && !devinfo->is_haswell)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparator,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           uint32_t surface,
                           src_reg surface_reg,
                           src_reg sampler_reg)
{
   /* The sampler can only meaningfully compute LOD for fragment shader
    * messages. For all other stages, we change the opcode to TXL and hardcode
    * the LOD to 0.
    *
    * textureQueryLevels() is implemented in terms of TXS so we need to pass a
    * valid LOD argument.
    */
   if (op == ir_tex || op == ir_query_levels) {
      assert(lod.file == BAD_FILE);
      lod = brw_imm_f(0.0f);
   }

   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
                             SHADER_OPCODE_TXF_CMS); break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   case ir_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway. For now, just always return false.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparator.file != BAD_FILE;

   inst->src[1] = surface_reg;
   inst->src[2] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
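      /* Added note: on Gen4 the LOD for these size/level queries goes in the
       * .w channel of the first parameter vec4; Gen5 and later take it in .x.
       */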
      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }
      /* Load the shadow comparator */
      if (shadow_comparator.file != BAD_FILE && op != ir_txd &&
          (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                          WRITEMASK_X),
                  shadow_comparator));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            mrf = param_base + 1;
            if (shadow_comparator.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
            /* MCS data is stored in the first two channels of mcs, but we
             * need to get it into the .y and .z channels of the second vec4
             * of params.
             */
            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
            emit(MOV(dst_reg(MRF, param_base + 1,
                             glsl_type::uint_type, WRITEMASK_YZ),
                     mcs));
         } else if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get
             * it into the .y channel of the second vec4 of params, so
             * replicate .x across the whole vec4 and then mask off everything
             * except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type,
                             WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            if (dest_type->vector_elements == 3 ||
                shadow_comparator.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparator.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparator.type, WRITEMASK_Z),
                           shadow_comparator));
               }
            }
         } else /* devinfo->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         if (shadow_comparator.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type,
                             WRITEMASK_W),
                     shadow_comparator));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type,
                          WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (op == ir_txs && devinfo->gen < 7) {
      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                  src_reg(inst->dst), brw_imm_d(1));
   }

   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
   }

   if (op == ir_query_levels) {
      /* # levels is in .w */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}

/**
 * Apply workarounds for Gen6 gather with UINT/SINT
 */
void
vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;
   dst_reg dst_f = dst;
   dst_f.type = BRW_REGISTER_TYPE_F;

   /* Convert from UNORM to UINT */
   emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
   emit(MOV(dst, src_reg(dst_f)));

   if (wa & WA_SIGN) {
      /* Reinterpret the UINT value as a signed INT value by
       * shifting the sign bit into place, then shifting back
       * preserving sign.
       */
      emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
      emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
   }
}

void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}

void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}

void
vec4_visitor::emit_ndc_computation()
{
   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
      return;

   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}

void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, brw_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(),
                  src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]),
                  brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));

         emit(CMP(dst_null_f(),
                  src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]),
                  brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = brw_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}

vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
{
   assert(varying < VARYING_SLOT_MAX);

   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
      reg.writemask =
         brw_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = BRW_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case VARYING_SLOT_EDGE:
      /* This is present when doing unfilled polygons. We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f). This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

static int
align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
{
   if (devinfo->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
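      /* Added note: mlen still counts the message header register here, so
       * the data portion is mlen - 1; rounding mlen up to an odd value keeps
       * that data portion an even number of registers.
       */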
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array. Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (devinfo->gen < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) >
             BRW_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while (!complete);
}


src_reg
vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gen6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (devinfo->gen < 6)
      message_header_scale *= 16;

   if (reladdr) {
      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
       * to multiply the reladdr by 2. Notice that the reg_offset part
       * is in units of 16 bytes and is used to select the low/high 16-byte
       * chunk of a full dvec4, so we don't want to multiply that part.
       */
      src_reg index = src_reg(this, glsl_type::int_type);
      if (type_sz(inst->dst.type) < 8) {
         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                      brw_imm_d(reg_offset)));
         emit_before(block, inst, MUL(dst_reg(index), index,
                                      brw_imm_d(message_header_scale)));
      } else {
         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                      brw_imm_d(message_header_scale * 2)));
         emit_before(block, inst, ADD(dst_reg(index), index,
                                      brw_imm_d(reg_offset *
                                                message_header_scale)));
      }
      return index;
   } else {
      return brw_imm_d(reg_offset * message_header_scale);
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   assert(orig_src.offset % REG_SIZE == 0);
   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                      reg_offset);

   if (type_sz(orig_src.type) < 8) {
      emit_before(block, inst, SCRATCH_READ(temp, index));
   } else {
      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
      shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
   }
}

/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
{
   assert(inst->dst.offset % REG_SIZE == 0);
   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                      reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write. If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   bool is_64bit = type_sz(inst->dst.type) == 8;
   const glsl_type *alloc_type =
      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                       inst->dst.type),
                                brw_swizzle_for_mask(inst->dst.writemask));

   if (!is_64bit) {
      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                          inst->dst.writemask));
      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
      if (inst->opcode != BRW_OPCODE_SEL)
         write->predicate = inst->predicate;
      write->ir = inst->ir;
      write->annotation = inst->annotation;
      inst->insert_after(block, write);
   } else {
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
         shuffle_64bit_data(shuffled, temp, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
                             glsl_type::dvec4_type : glsl_type::vec4_type);
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers. So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store. Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset, src_reg indirect)
{
   assert(orig_src.offset % 16 == 0);
   const unsigned index = prog_data->base.binding_table.pull_constants_start;

   /* For 64bit loads we need to emit two 32-bit load messages and we also
    * need to shuffle the 32-bit data result into proper 64-bit data. To do
    * that we emit the 32-bit loads into a temporary and we shuffle the result
    * into the original destination.
    */
   dst_reg orig_temp = temp;
   bool is_64bit = type_sz(orig_src.type) == 8;
   if (is_64bit) {
      assert(type_sz(temp.type) == 8);
      dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
      temp = retype(temp_df, BRW_REGISTER_TYPE_F);
   }

   src_reg src = orig_src;
   for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
      int reg_offset = base_offset + src.offset / 16;

      src_reg offset;
      if (indirect.file != BAD_FILE) {
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, ADD(dst_reg(offset), indirect,
                                      brw_imm_ud(reg_offset * 16)));
      } else if (devinfo->gen >= 8) {
         /* Store the offset in a GRF so we can send-from-GRF. */
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, MOV(dst_reg(offset),
                                      brw_imm_ud(reg_offset * 16)));
      } else {
         offset = brw_imm_d(reg_offset * 16);
      }

      emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
                                  brw_imm_ud(index),
                                  offset,
                                  block, inst);

      src = byte_offset(src, 16);
   }

   brw_mark_surface_used(&prog_data->base, index);

   if (is_64bit) {
      temp = retype(temp, BRW_REGISTER_TYPE_DF);
      shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
   }
}

/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants. In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   /* The Vulkan driver doesn't support pull constants other than UBOs, so
    * everything has to be pushed regardless.
    */
   if (stage_prog_data->pull_param == NULL) {
      split_uniform_registers();
      return;
   }

   int pull_constant_loc[this->uniforms];
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));

   /* First, walk through the instructions and determine which things need to
    * be pulled. We mark something as needing to be pulled by setting
    * pull_constant_loc to 0.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
         pull_constant_loc[uniform_nr + j] = 0;
   }

   /* Next, we walk the list of uniforms and assign real pull constant
    * locations and set their corresponding entries in pull_param.
    */
   for (int j = 0; j < this->uniforms; j++) {
      if (pull_constant_loc[j] < 0)
         continue;

      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;

      for (int i = 0; i < 4; i++) {
         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
            = stage_prog_data->param[j * 4 + i];
      }
   }

   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
    * instructions to actual uniform pulls.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);

      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
                              pull_constant_loc[uniform_nr], inst->src[1]);
      inst->remove(block);
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects. This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}

void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                           void *log_data,
                           const struct brw_sampler_prog_key_data *key_tex,
                           struct brw_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           void *mem_ctx,
                           bool no_spills,
                           int shader_time_index)
   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     need_all_constants_in_pull_buffer(false),
     no_spills(no_spills),
     shader_time_index(shader_time_index),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->live_intervals = NULL;

   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;
}

vec4_visitor::~vec4_visitor()
{
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */