1 /* Copyright 2011 Intel Corporation 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a 4 * copy of this software and associated documentation files (the "Software"), 5 * to deal in the Software without restriction, including without limitation 6 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 * and/or sell copies of the Software, and to permit persons to whom the 8 * Software is furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice (including the next 11 * paragraph) shall be included in all copies or substantial portions of the 12 * Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 20 * IN THE SOFTWARE. 21 */ 22 23 #include "brw_vec4.h" 24 #include "brw_cfg.h" 25 #include "brw_eu.h" 26 #include "brw_program.h" 27 28 using namespace brw; 29 30 static void 31 generate_math1_gen4(struct brw_codegen *p, 32 vec4_instruction *inst, 33 struct brw_reg dst, 34 struct brw_reg src) 35 { 36 gen4_math(p, 37 dst, 38 brw_math_function(inst->opcode), 39 inst->base_mrf, 40 src, 41 BRW_MATH_PRECISION_FULL); 42 } 43 44 static void 45 check_gen6_math_src_arg(struct brw_reg src) 46 { 47 /* Source swizzles are ignored. */ 48 assert(!src.abs); 49 assert(!src.negate); 50 assert(src.swizzle == BRW_SWIZZLE_XYZW); 51 } 52 53 static void 54 generate_math_gen6(struct brw_codegen *p, 55 vec4_instruction *inst, 56 struct brw_reg dst, 57 struct brw_reg src0, 58 struct brw_reg src1) 59 { 60 /* Can't do writemask because math can't be align16. */ 61 assert(dst.writemask == WRITEMASK_XYZW); 62 /* Source swizzles are ignored. */ 63 check_gen6_math_src_arg(src0); 64 if (src1.file == BRW_GENERAL_REGISTER_FILE) 65 check_gen6_math_src_arg(src1); 66 67 brw_set_default_access_mode(p, BRW_ALIGN_1); 68 gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1); 69 brw_set_default_access_mode(p, BRW_ALIGN_16); 70 } 71 72 static void 73 generate_math2_gen4(struct brw_codegen *p, 74 vec4_instruction *inst, 75 struct brw_reg dst, 76 struct brw_reg src0, 77 struct brw_reg src1) 78 { 79 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 80 * "Message Payload": 81 * 82 * "Operand0[7]. For the INT DIV functions, this operand is the 83 * denominator." 84 * ... 85 * "Operand1[7]. For the INT DIV functions, this operand is the 86 * numerator." 87 */ 88 bool is_int_div = inst->opcode != SHADER_OPCODE_POW; 89 struct brw_reg &op0 = is_int_div ? src1 : src0; 90 struct brw_reg &op1 = is_int_div ? 
src0 : src1; 91 92 brw_push_insn_state(p); 93 brw_set_default_saturate(p, false); 94 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 95 brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1); 96 brw_pop_insn_state(p); 97 98 gen4_math(p, 99 dst, 100 brw_math_function(inst->opcode), 101 inst->base_mrf, 102 op0, 103 BRW_MATH_PRECISION_FULL); 104 } 105 106 static void 107 generate_tex(struct brw_codegen *p, 108 struct brw_vue_prog_data *prog_data, 109 gl_shader_stage stage, 110 vec4_instruction *inst, 111 struct brw_reg dst, 112 struct brw_reg src, 113 struct brw_reg surface_index, 114 struct brw_reg sampler_index) 115 { 116 const struct gen_device_info *devinfo = p->devinfo; 117 int msg_type = -1; 118 119 if (devinfo->gen >= 5) { 120 switch (inst->opcode) { 121 case SHADER_OPCODE_TEX: 122 case SHADER_OPCODE_TXL: 123 if (inst->shadow_compare) { 124 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 125 } else { 126 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 127 } 128 break; 129 case SHADER_OPCODE_TXD: 130 if (inst->shadow_compare) { 131 /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */ 132 assert(devinfo->gen >= 8 || devinfo->is_haswell); 133 msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; 134 } else { 135 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; 136 } 137 break; 138 case SHADER_OPCODE_TXF: 139 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 140 break; 141 case SHADER_OPCODE_TXF_CMS_W: 142 assert(devinfo->gen >= 9); 143 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; 144 break; 145 case SHADER_OPCODE_TXF_CMS: 146 if (devinfo->gen >= 7) 147 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; 148 else 149 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 150 break; 151 case SHADER_OPCODE_TXF_MCS: 152 assert(devinfo->gen >= 7); 153 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; 154 break; 155 case SHADER_OPCODE_TXS: 156 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; 157 break; 158 case SHADER_OPCODE_TG4: 159 if (inst->shadow_compare) { 160 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; 161 } else { 162 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; 163 } 164 break; 165 case SHADER_OPCODE_TG4_OFFSET: 166 if (inst->shadow_compare) { 167 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; 168 } else { 169 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; 170 } 171 break; 172 case SHADER_OPCODE_SAMPLEINFO: 173 msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; 174 break; 175 default: 176 unreachable("should not get here: invalid vec4 texture opcode"); 177 } 178 } else { 179 switch (inst->opcode) { 180 case SHADER_OPCODE_TEX: 181 case SHADER_OPCODE_TXL: 182 if (inst->shadow_compare) { 183 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE; 184 assert(inst->mlen == 3); 185 } else { 186 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD; 187 assert(inst->mlen == 2); 188 } 189 break; 190 case SHADER_OPCODE_TXD: 191 /* There is no sample_d_c message; comparisons are done manually. */ 192 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS; 193 assert(inst->mlen == 4); 194 break; 195 case SHADER_OPCODE_TXF: 196 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD; 197 assert(inst->mlen == 2); 198 break; 199 case SHADER_OPCODE_TXS: 200 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO; 201 assert(inst->mlen == 2); 202 break; 203 default: 204 unreachable("should not get here: invalid vec4 texture opcode"); 205 } 206 } 207 208 assert(msg_type != -1); 209 210 assert(sampler_index.type == BRW_REGISTER_TYPE_UD); 211 212 /* Load the message header if present. 
If there's a texture offset, we need 213 * to set it up explicitly and load the offset bitfield. Otherwise, we can 214 * use an implied move from g0 to the first message register. 215 */ 216 if (inst->header_size != 0) { 217 if (devinfo->gen < 6 && !inst->offset) { 218 /* Set up an implied move from g0 to the MRF. */ 219 src = brw_vec8_grf(0, 0); 220 } else { 221 struct brw_reg header = 222 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD); 223 uint32_t dw2 = 0; 224 225 /* Explicitly set up the message header by copying g0 to the MRF. */ 226 brw_push_insn_state(p); 227 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 228 brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 229 230 brw_set_default_access_mode(p, BRW_ALIGN_1); 231 232 if (inst->offset) 233 /* Set the texel offset bits in DWord 2. */ 234 dw2 = inst->offset; 235 236 if (devinfo->gen >= 9) 237 /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D, 238 * based on bit 22 in the header. 239 */ 240 dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2; 241 242 /* The VS, DS, and FS stages have the g0.2 payload delivered as 0, 243 * so header0.2 is 0 when g0 is copied. The HS and GS stages do 244 * not, so we must set to to 0 to avoid setting undesirable bits 245 * in the message header. 246 */ 247 if (dw2 || 248 stage == MESA_SHADER_TESS_CTRL || 249 stage == MESA_SHADER_GEOMETRY) { 250 brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2)); 251 } 252 253 brw_adjust_sampler_state_pointer(p, header, sampler_index); 254 brw_pop_insn_state(p); 255 } 256 } 257 258 uint32_t return_format; 259 260 switch (dst.type) { 261 case BRW_REGISTER_TYPE_D: 262 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; 263 break; 264 case BRW_REGISTER_TYPE_UD: 265 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 266 break; 267 default: 268 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 269 break; 270 } 271 272 uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || 273 inst->opcode == SHADER_OPCODE_TG4_OFFSET) 274 ? prog_data->base.binding_table.gather_texture_start 275 : prog_data->base.binding_table.texture_start; 276 277 if (surface_index.file == BRW_IMMEDIATE_VALUE && 278 sampler_index.file == BRW_IMMEDIATE_VALUE) { 279 uint32_t surface = surface_index.ud; 280 uint32_t sampler = sampler_index.ud; 281 282 brw_SAMPLE(p, 283 dst, 284 inst->base_mrf, 285 src, 286 surface + base_binding_table_index, 287 sampler % 16, 288 msg_type, 289 1, /* response length */ 290 inst->mlen, 291 inst->header_size != 0, 292 BRW_SAMPLER_SIMD_MODE_SIMD4X2, 293 return_format); 294 295 brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index); 296 } else { 297 /* Non-constant sampler index. 
*/ 298 299 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 300 struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD)); 301 struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); 302 303 brw_push_insn_state(p); 304 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 305 brw_set_default_access_mode(p, BRW_ALIGN_1); 306 307 if (brw_regs_equal(&surface_reg, &sampler_reg)) { 308 brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); 309 } else { 310 if (sampler_reg.file == BRW_IMMEDIATE_VALUE) { 311 brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8)); 312 } else { 313 brw_SHL(p, addr, sampler_reg, brw_imm_ud(8)); 314 brw_OR(p, addr, addr, surface_reg); 315 } 316 } 317 if (base_binding_table_index) 318 brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index)); 319 brw_AND(p, addr, addr, brw_imm_ud(0xfff)); 320 321 brw_pop_insn_state(p); 322 323 if (inst->base_mrf != -1) 324 gen6_resolve_implied_move(p, &src, inst->base_mrf); 325 326 /* dst = send(offset, a0.0 | <descriptor>) */ 327 brw_inst *insn = brw_send_indirect_message( 328 p, BRW_SFID_SAMPLER, dst, src, addr); 329 brw_set_sampler_message(p, insn, 330 0 /* surface */, 331 0 /* sampler */, 332 msg_type, 333 1 /* rlen */, 334 inst->mlen /* mlen */, 335 inst->header_size != 0 /* header */, 336 BRW_SAMPLER_SIMD_MODE_SIMD4X2, 337 return_format); 338 339 /* visitor knows more than we do about the surface limit required, 340 * so has already done marking. 341 */ 342 } 343 } 344 345 static void 346 generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst) 347 { 348 brw_urb_WRITE(p, 349 brw_null_reg(), /* dest */ 350 inst->base_mrf, /* starting mrf reg nr */ 351 brw_vec8_grf(0, 0), /* src */ 352 inst->urb_write_flags, 353 inst->mlen, 354 0, /* response len */ 355 inst->offset, /* urb destination offset */ 356 BRW_URB_SWIZZLE_INTERLEAVE); 357 } 358 359 static void 360 generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst) 361 { 362 struct brw_reg src = brw_message_reg(inst->base_mrf); 363 brw_urb_WRITE(p, 364 brw_null_reg(), /* dest */ 365 inst->base_mrf, /* starting mrf reg nr */ 366 src, 367 inst->urb_write_flags, 368 inst->mlen, 369 0, /* response len */ 370 inst->offset, /* urb destination offset */ 371 BRW_URB_SWIZZLE_INTERLEAVE); 372 } 373 374 static void 375 generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst) 376 { 377 struct brw_reg src = brw_message_reg(inst->base_mrf); 378 379 /* We pass the temporary passed in src0 as the writeback register */ 380 brw_urb_WRITE(p, 381 inst->src[0].as_brw_reg(), /* dest */ 382 inst->base_mrf, /* starting mrf reg nr */ 383 src, 384 BRW_URB_WRITE_ALLOCATE_COMPLETE, 385 inst->mlen, 386 1, /* response len */ 387 inst->offset, /* urb destination offset */ 388 BRW_URB_SWIZZLE_INTERLEAVE); 389 390 /* Now put allocated urb handle in dst.0 */ 391 brw_push_insn_state(p); 392 brw_set_default_access_mode(p, BRW_ALIGN_1); 393 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 394 brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0), 395 get_element_ud(inst->src[0].as_brw_reg(), 0)); 396 brw_pop_insn_state(p); 397 } 398 399 static void 400 generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst) 401 { 402 struct brw_reg src = brw_message_reg(inst->base_mrf); 403 brw_urb_WRITE(p, 404 brw_null_reg(), /* dest */ 405 inst->base_mrf, /* starting mrf reg nr */ 406 src, 407 BRW_URB_WRITE_EOT | inst->urb_write_flags, 408 inst->mlen, 409 0, /* response len */ 410 0, /* urb 
destination offset */ 411 BRW_URB_SWIZZLE_INTERLEAVE); 412 } 413 414 static void 415 generate_gs_set_write_offset(struct brw_codegen *p, 416 struct brw_reg dst, 417 struct brw_reg src0, 418 struct brw_reg src1) 419 { 420 /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message 421 * Header: M0.3): 422 * 423 * Slot 0 Offset. This field, after adding to the Global Offset field 424 * in the message descriptor, specifies the offset (in 256-bit units) 425 * from the start of the URB entry, as referenced by URB Handle 0, at 426 * which the data will be accessed. 427 * 428 * Similar text describes DWORD M0.4, which is slot 1 offset. 429 * 430 * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components 431 * of the register for geometry shader invocations 0 and 1) by the 432 * immediate value in src1, and store the result in DWORDs 3 and 4 of dst. 433 * 434 * We can do this with the following EU instruction: 435 * 436 * mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all } 437 */ 438 brw_push_insn_state(p); 439 brw_set_default_access_mode(p, BRW_ALIGN_1); 440 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 441 assert(p->devinfo->gen >= 7 && 442 src1.file == BRW_IMMEDIATE_VALUE && 443 src1.type == BRW_REGISTER_TYPE_UD && 444 src1.ud <= USHRT_MAX); 445 if (src0.file == BRW_IMMEDIATE_VALUE) { 446 brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3), 447 brw_imm_ud(src0.ud * src1.ud)); 448 } else { 449 brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), 450 retype(src1, BRW_REGISTER_TYPE_UW)); 451 } 452 brw_pop_insn_state(p); 453 } 454 455 static void 456 generate_gs_set_vertex_count(struct brw_codegen *p, 457 struct brw_reg dst, 458 struct brw_reg src) 459 { 460 brw_push_insn_state(p); 461 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 462 463 if (p->devinfo->gen >= 8) { 464 /* Move the vertex count into the second MRF for the EOT write. */ 465 brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD), 466 src); 467 } else { 468 /* If we think of the src and dst registers as composed of 8 DWORDs each, 469 * we want to pick up the contents of DWORDs 0 and 4 from src, truncate 470 * them to WORDs, and then pack them into DWORD 2 of dst. 471 * 472 * It's easier to get the EU to do this if we think of the src and dst 473 * registers as composed of 16 WORDS each; then, we want to pick up the 474 * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 475 * of dst. 476 * 477 * We can do that by the following EU instruction: 478 * 479 * mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask } 480 */ 481 brw_set_default_access_mode(p, BRW_ALIGN_1); 482 brw_MOV(p, 483 suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4), 484 stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0)); 485 } 486 brw_pop_insn_state(p); 487 } 488 489 static void 490 generate_gs_svb_write(struct brw_codegen *p, 491 struct brw_vue_prog_data *prog_data, 492 vec4_instruction *inst, 493 struct brw_reg dst, 494 struct brw_reg src0, 495 struct brw_reg src1) 496 { 497 int binding = inst->sol_binding; 498 bool final_write = inst->sol_final_write; 499 500 brw_push_insn_state(p); 501 brw_set_default_exec_size(p, BRW_EXECUTE_4); 502 /* Copy Vertex data into M0.x */ 503 brw_MOV(p, stride(dst, 4, 4, 1), 504 stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1)); 505 brw_pop_insn_state(p); 506 507 brw_push_insn_state(p); 508 /* Send SVB Write */ 509 brw_svb_write(p, 510 final_write ? 
src1 : brw_null_reg(), /* dest == src1 */ 511 1, /* msg_reg_nr */ 512 dst, /* src0 == previous dst */ 513 SURF_INDEX_GEN6_SOL_BINDING(binding), /* binding_table_index */ 514 final_write); /* send_commit_msg */ 515 516 /* Finally, wait for the write commit to occur so that we can proceed to 517 * other things safely. 518 * 519 * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3: 520 * 521 * The write commit does not modify the destination register, but 522 * merely clears the dependency associated with the destination 523 * register. Thus, a simple mov instruction using the register as a 524 * source is sufficient to wait for the write commit to occur. 525 */ 526 if (final_write) { 527 brw_MOV(p, src1, src1); 528 } 529 brw_pop_insn_state(p); 530 } 531 532 static void 533 generate_gs_svb_set_destination_index(struct brw_codegen *p, 534 vec4_instruction *inst, 535 struct brw_reg dst, 536 struct brw_reg src) 537 { 538 int vertex = inst->sol_vertex; 539 brw_push_insn_state(p); 540 brw_set_default_access_mode(p, BRW_ALIGN_1); 541 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 542 brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex)); 543 brw_pop_insn_state(p); 544 } 545 546 static void 547 generate_gs_set_dword_2(struct brw_codegen *p, 548 struct brw_reg dst, 549 struct brw_reg src) 550 { 551 brw_push_insn_state(p); 552 brw_set_default_access_mode(p, BRW_ALIGN_1); 553 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 554 brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0)); 555 brw_pop_insn_state(p); 556 } 557 558 static void 559 generate_gs_prepare_channel_masks(struct brw_codegen *p, 560 struct brw_reg dst) 561 { 562 /* We want to left shift just DWORD 4 (the x component belonging to the 563 * second geometry shader invocation) by 4 bits. So generate the 564 * instruction: 565 * 566 * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all } 567 */ 568 dst = suboffset(vec1(dst), 4); 569 brw_push_insn_state(p); 570 brw_set_default_access_mode(p, BRW_ALIGN_1); 571 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 572 brw_SHL(p, dst, dst, brw_imm_ud(4)); 573 brw_pop_insn_state(p); 574 } 575 576 static void 577 generate_gs_set_channel_masks(struct brw_codegen *p, 578 struct brw_reg dst, 579 struct brw_reg src) 580 { 581 /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message 582 * Header: M0.5): 583 * 584 * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask 585 * 586 * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1 587 * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls 588 * Vertex 0 DATA[7]. This bit is ANDed with the corresponding 589 * channel enable to determine the final channel enable. For the 590 * URB_READ_OWORD & URB_READ_HWORD messages, when final channel 591 * enable is 1 it indicates that Vertex 1 DATA [3] will be included 592 * in the writeback message. For the URB_WRITE_OWORD & 593 * URB_WRITE_HWORD messages, when final channel enable is 1 it 594 * indicates that Vertex 1 DATA [3] will be written to the surface. 
595 * 596 * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included 597 * 1: Vertex DATA [3] / Vertex 0 DATA[7] channel included 598 * 599 * 14 Vertex 1 DATA [2] Channel Mask 600 * 13 Vertex 1 DATA [1] Channel Mask 601 * 12 Vertex 1 DATA [0] Channel Mask 602 * 11 Vertex 0 DATA [3] Channel Mask 603 * 10 Vertex 0 DATA [2] Channel Mask 604 * 9 Vertex 0 DATA [1] Channel Mask 605 * 8 Vertex 0 DATA [0] Channel Mask 606 * 607 * (This is from a section of the PRM that is agnostic to the particular 608 * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to 609 * geometry shader invocations 0 and 1, respectively). Since we have the 610 * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0, 611 * and the enable flags for geometry shader invocation 1 in bits 7:0 of 612 * DWORD 4, we just need to OR them together and store the result in bits 613 * 15:8 of DWORD 5. 614 * 615 * It's easier to get the EU to do this if we think of the src and dst 616 * registers as composed of 32 bytes each; then, we want to pick up the 617 * contents of bytes 0 and 16 from src, OR them together, and store them in 618 * byte 21. 619 * 620 * We can do that by the following EU instruction: 621 * 622 * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all } 623 * 624 * Note: this relies on the source register having zeros in (a) bits 7:4 of 625 * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the 626 * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which 627 * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to 628 * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to 629 * contain valid channel mask values (which are in the range 0x0-0xf). 630 */ 631 dst = retype(dst, BRW_REGISTER_TYPE_UB); 632 src = retype(src, BRW_REGISTER_TYPE_UB); 633 brw_push_insn_state(p); 634 brw_set_default_access_mode(p, BRW_ALIGN_1); 635 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 636 brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16)); 637 brw_pop_insn_state(p); 638 } 639 640 static void 641 generate_gs_get_instance_id(struct brw_codegen *p, 642 struct brw_reg dst) 643 { 644 /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT 645 * and store into dst.0 & dst.4. 
So generate the instruction: 646 * 647 * shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q } 648 */ 649 brw_push_insn_state(p); 650 brw_set_default_access_mode(p, BRW_ALIGN_1); 651 dst = retype(dst, BRW_REGISTER_TYPE_UD); 652 struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 653 brw_SHR(p, dst, stride(r0, 1, 4, 0), 654 brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT)); 655 brw_pop_insn_state(p); 656 } 657 658 static void 659 generate_gs_ff_sync_set_primitives(struct brw_codegen *p, 660 struct brw_reg dst, 661 struct brw_reg src0, 662 struct brw_reg src1, 663 struct brw_reg src2) 664 { 665 brw_push_insn_state(p); 666 brw_set_default_access_mode(p, BRW_ALIGN_1); 667 /* Save src0 data in 16:31 bits of dst.0 */ 668 brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0), 669 brw_imm_ud(0xffffu)); 670 brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16)); 671 /* Save src1 data in 0:15 bits of dst.0 */ 672 brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0), 673 brw_imm_ud(0xffffu)); 674 brw_OR(p, suboffset(vec1(dst), 0), 675 suboffset(vec1(dst), 0), 676 suboffset(vec1(src2), 0)); 677 brw_pop_insn_state(p); 678 } 679 680 static void 681 generate_gs_ff_sync(struct brw_codegen *p, 682 vec4_instruction *inst, 683 struct brw_reg dst, 684 struct brw_reg src0, 685 struct brw_reg src1) 686 { 687 /* This opcode uses an implied MRF register for: 688 * - the header of the ff_sync message. And as such it is expected to be 689 * initialized to r0 before calling here. 690 * - the destination where we will write the allocated URB handle. 691 */ 692 struct brw_reg header = 693 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD); 694 695 /* Overwrite dword 0 of the header (SO vertices to write) and 696 * dword 1 (number of primitives written). 
697 */ 698 brw_push_insn_state(p); 699 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 700 brw_set_default_access_mode(p, BRW_ALIGN_1); 701 brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0)); 702 brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0)); 703 brw_pop_insn_state(p); 704 705 /* Allocate URB handle in dst */ 706 brw_ff_sync(p, 707 dst, 708 0, 709 header, 710 1, /* allocate */ 711 1, /* response length */ 712 0 /* eot */); 713 714 /* Now put allocated urb handle in header.0 */ 715 brw_push_insn_state(p); 716 brw_set_default_access_mode(p, BRW_ALIGN_1); 717 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 718 brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0)); 719 720 /* src1 is not an immediate when we use transform feedback */ 721 if (src1.file != BRW_IMMEDIATE_VALUE) { 722 brw_set_default_exec_size(p, BRW_EXECUTE_4); 723 brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1)); 724 } 725 726 brw_pop_insn_state(p); 727 } 728 729 static void 730 generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst) 731 { 732 /* In gen6, PrimitiveID is delivered in R0.1 of the payload */ 733 struct brw_reg src = brw_vec8_grf(0, 0); 734 brw_push_insn_state(p); 735 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 736 brw_set_default_access_mode(p, BRW_ALIGN_1); 737 brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1)); 738 brw_pop_insn_state(p); 739 } 740 741 static void 742 generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst) 743 { 744 const struct gen_device_info *devinfo = p->devinfo; 745 const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail; 746 747 /* "Instance Count" comes as part of the payload in r0.2 bits 23:17. 748 * 749 * Since we operate in SIMD4x2 mode, we need run half as many threads 750 * as necessary. So we assign (2i + 1, 2i) as the thread counts. We 751 * shift right by one less to accomplish the multiplication by two. 752 */ 753 dst = retype(dst, BRW_REGISTER_TYPE_UD); 754 struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 755 756 brw_push_insn_state(p); 757 brw_set_default_access_mode(p, BRW_ALIGN_1); 758 759 const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17); 760 const int shift = ivb ? 16 : 17; 761 762 brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask)); 763 brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0), 764 brw_imm_ud(shift - 1)); 765 brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1)); 766 767 brw_pop_insn_state(p); 768 } 769 770 static void 771 generate_tcs_urb_write(struct brw_codegen *p, 772 vec4_instruction *inst, 773 struct brw_reg urb_header) 774 { 775 const struct gen_device_info *devinfo = p->devinfo; 776 777 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 778 brw_set_dest(p, send, brw_null_reg()); 779 brw_set_src0(p, send, urb_header); 780 781 brw_set_message_descriptor(p, send, BRW_SFID_URB, 782 inst->mlen /* mlen */, 0 /* rlen */, 783 true /* header */, false /* eot */); 784 brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD); 785 brw_inst_set_urb_global_offset(devinfo, send, inst->offset); 786 if (inst->urb_write_flags & BRW_URB_WRITE_EOT) { 787 brw_inst_set_eot(devinfo, send, 1); 788 } else { 789 brw_inst_set_urb_per_slot_offset(devinfo, send, 1); 790 brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); 791 } 792 793 /* what happens to swizzles? 
*/ 794 } 795 796 797 static void 798 generate_tcs_input_urb_offsets(struct brw_codegen *p, 799 struct brw_reg dst, 800 struct brw_reg vertex, 801 struct brw_reg offset) 802 { 803 /* Generates an URB read/write message header for HS/DS operation. 804 * Inputs are a vertex index, and a byte offset from the beginning of 805 * the vertex. */ 806 807 /* If `vertex` is not an immediate, we clobber a0.0 */ 808 809 assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE); 810 assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D); 811 812 assert(dst.file == BRW_GENERAL_REGISTER_FILE); 813 814 brw_push_insn_state(p); 815 brw_set_default_access_mode(p, BRW_ALIGN_1); 816 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 817 brw_MOV(p, dst, brw_imm_ud(0)); 818 819 /* m0.5 bits 8-15 are channel enables */ 820 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); 821 822 /* m0.0-0.1: URB handles */ 823 if (vertex.file == BRW_IMMEDIATE_VALUE) { 824 uint32_t vertex_index = vertex.ud; 825 struct brw_reg index_reg = brw_vec1_grf( 826 1 + (vertex_index >> 3), vertex_index & 7); 827 828 brw_MOV(p, vec2(get_element_ud(dst, 0)), 829 retype(index_reg, BRW_REGISTER_TYPE_UD)); 830 } else { 831 /* Use indirect addressing. ICP Handles are DWords (single channels 832 * of a register) and start at g1.0. 833 * 834 * In order to start our region at g1.0, we add 8 to the vertex index, 835 * effectively skipping over the 8 channels in g0.0. This gives us a 836 * DWord offset to the ICP Handle. 837 * 838 * Indirect addressing works in terms of bytes, so we then multiply 839 * the DWord offset by 4 (by shifting left by 2). 840 */ 841 struct brw_reg addr = brw_address_reg(0); 842 843 /* bottom half: m0.0 = g[1.0 + vertex.0]UD */ 844 brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8)); 845 brw_SHL(p, addr, addr, brw_imm_ud(2)); 846 brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0)); 847 848 /* top half: m0.1 = g[1.0 + vertex.4]UD */ 849 brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8)); 850 brw_SHL(p, addr, addr, brw_imm_ud(2)); 851 brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0)); 852 } 853 854 /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ 855 if (offset.file != ARF) 856 brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); 857 858 brw_pop_insn_state(p); 859 } 860 861 862 static void 863 generate_tcs_output_urb_offsets(struct brw_codegen *p, 864 struct brw_reg dst, 865 struct brw_reg write_mask, 866 struct brw_reg offset) 867 { 868 /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. 
*/ 869 assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE); 870 871 assert(write_mask.file == BRW_IMMEDIATE_VALUE); 872 assert(write_mask.type == BRW_REGISTER_TYPE_UD); 873 874 brw_push_insn_state(p); 875 876 brw_set_default_access_mode(p, BRW_ALIGN_1); 877 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 878 brw_MOV(p, dst, brw_imm_ud(0)); 879 880 unsigned mask = write_mask.ud; 881 882 /* m0.5 bits 15:12 and 11:8 are channel enables */ 883 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12))); 884 885 /* HS patch URB handle is delivered in r0.0 */ 886 struct brw_reg urb_handle = brw_vec1_grf(0, 0); 887 888 /* m0.0-0.1: URB handles */ 889 brw_MOV(p, vec2(get_element_ud(dst, 0)), 890 retype(urb_handle, BRW_REGISTER_TYPE_UD)); 891 892 /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ 893 if (offset.file != ARF) 894 brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); 895 896 brw_pop_insn_state(p); 897 } 898 899 static void 900 generate_tes_create_input_read_header(struct brw_codegen *p, 901 struct brw_reg dst) 902 { 903 brw_push_insn_state(p); 904 brw_set_default_access_mode(p, BRW_ALIGN_1); 905 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 906 907 /* Initialize the register to 0 */ 908 brw_MOV(p, dst, brw_imm_ud(0)); 909 910 /* Enable all the channels in m0.5 bits 15:8 */ 911 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); 912 913 /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1. For safety, 914 * mask out irrelevant "Reserved" bits, as they're not marked MBZ. 915 */ 916 brw_AND(p, vec2(get_element_ud(dst, 0)), 917 retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD), 918 brw_imm_ud(0x1fff)); 919 brw_pop_insn_state(p); 920 } 921 922 static void 923 generate_tes_add_indirect_urb_offset(struct brw_codegen *p, 924 struct brw_reg dst, 925 struct brw_reg header, 926 struct brw_reg offset) 927 { 928 brw_push_insn_state(p); 929 brw_set_default_access_mode(p, BRW_ALIGN_1); 930 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 931 932 brw_MOV(p, dst, header); 933 /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */ 934 brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); 935 936 brw_pop_insn_state(p); 937 } 938 939 static void 940 generate_vec4_urb_read(struct brw_codegen *p, 941 vec4_instruction *inst, 942 struct brw_reg dst, 943 struct brw_reg header) 944 { 945 const struct gen_device_info *devinfo = p->devinfo; 946 947 assert(header.file == BRW_GENERAL_REGISTER_FILE); 948 assert(header.type == BRW_REGISTER_TYPE_UD); 949 950 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 951 brw_set_dest(p, send, dst); 952 brw_set_src0(p, send, header); 953 954 brw_set_message_descriptor(p, send, BRW_SFID_URB, 955 1 /* mlen */, 1 /* rlen */, 956 true /* header */, false /* eot */); 957 brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); 958 brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); 959 brw_inst_set_urb_per_slot_offset(devinfo, send, 1); 960 961 brw_inst_set_urb_global_offset(devinfo, send, inst->offset); 962 } 963 964 static void 965 generate_tcs_release_input(struct brw_codegen *p, 966 struct brw_reg header, 967 struct brw_reg vertex, 968 struct brw_reg is_unpaired) 969 { 970 const struct gen_device_info *devinfo = p->devinfo; 971 972 assert(vertex.file == BRW_IMMEDIATE_VALUE); 973 assert(vertex.type == BRW_REGISTER_TYPE_UD); 974 975 /* m0.0-0.1: URB handles */ 976 struct brw_reg urb_handles = 977 retype(brw_vec2_grf(1 + 
(vertex.ud >> 3), vertex.ud & 7), 978 BRW_REGISTER_TYPE_UD); 979 980 brw_push_insn_state(p); 981 brw_set_default_access_mode(p, BRW_ALIGN_1); 982 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 983 brw_MOV(p, header, brw_imm_ud(0)); 984 brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles); 985 brw_pop_insn_state(p); 986 987 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 988 brw_set_dest(p, send, brw_null_reg()); 989 brw_set_src0(p, send, header); 990 brw_set_message_descriptor(p, send, BRW_SFID_URB, 991 1 /* mlen */, 0 /* rlen */, 992 true /* header */, false /* eot */); 993 brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); 994 brw_inst_set_urb_complete(devinfo, send, 1); 995 brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ? 996 BRW_URB_SWIZZLE_NONE : 997 BRW_URB_SWIZZLE_INTERLEAVE); 998 } 999 1000 static void 1001 generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst) 1002 { 1003 struct brw_reg header = brw_message_reg(inst->base_mrf); 1004 1005 brw_push_insn_state(p); 1006 brw_set_default_access_mode(p, BRW_ALIGN_1); 1007 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1008 brw_MOV(p, header, brw_imm_ud(0)); 1009 brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8)); 1010 brw_MOV(p, get_element_ud(header, 0), 1011 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); 1012 brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u)); 1013 brw_pop_insn_state(p); 1014 1015 brw_urb_WRITE(p, 1016 brw_null_reg(), /* dest */ 1017 inst->base_mrf, /* starting mrf reg nr */ 1018 header, 1019 BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD | 1020 BRW_URB_WRITE_USE_CHANNEL_MASKS, 1021 inst->mlen, 1022 0, /* response len */ 1023 0, /* urb destination offset */ 1024 0); 1025 } 1026 1027 static void 1028 generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) 1029 { 1030 brw_push_insn_state(p); 1031 brw_set_default_access_mode(p, BRW_ALIGN_1); 1032 brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D)); 1033 brw_pop_insn_state(p); 1034 } 1035 1036 static void 1037 generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) 1038 { 1039 brw_push_insn_state(p); 1040 brw_set_default_access_mode(p, BRW_ALIGN_1); 1041 brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); 1042 brw_pop_insn_state(p); 1043 } 1044 1045 static void 1046 generate_tcs_create_barrier_header(struct brw_codegen *p, 1047 struct brw_vue_prog_data *prog_data, 1048 struct brw_reg dst) 1049 { 1050 const struct gen_device_info *devinfo = p->devinfo; 1051 const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail; 1052 struct brw_reg m0_2 = get_element_ud(dst, 2); 1053 unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances; 1054 1055 brw_push_insn_state(p); 1056 brw_set_default_access_mode(p, BRW_ALIGN_1); 1057 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1058 1059 /* Zero the message header */ 1060 brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)); 1061 1062 /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */ 1063 brw_AND(p, m0_2, 1064 retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 1065 brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13))); 1066 1067 /* Shift it up to bits 27:24. */ 1068 brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 
12 : 11)); 1069 1070 /* Set the Barrier Count and the enable bit */ 1071 brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15))); 1072 1073 brw_pop_insn_state(p); 1074 } 1075 1076 static void 1077 generate_oword_dual_block_offsets(struct brw_codegen *p, 1078 struct brw_reg m1, 1079 struct brw_reg index) 1080 { 1081 int second_vertex_offset; 1082 1083 if (p->devinfo->gen >= 6) 1084 second_vertex_offset = 1; 1085 else 1086 second_vertex_offset = 16; 1087 1088 m1 = retype(m1, BRW_REGISTER_TYPE_D); 1089 1090 /* Set up M1 (message payload). Only the block offsets in M1.0 and 1091 * M1.4 are used, and the rest are ignored. 1092 */ 1093 struct brw_reg m1_0 = suboffset(vec1(m1), 0); 1094 struct brw_reg m1_4 = suboffset(vec1(m1), 4); 1095 struct brw_reg index_0 = suboffset(vec1(index), 0); 1096 struct brw_reg index_4 = suboffset(vec1(index), 4); 1097 1098 brw_push_insn_state(p); 1099 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1100 brw_set_default_access_mode(p, BRW_ALIGN_1); 1101 1102 brw_MOV(p, m1_0, index_0); 1103 1104 if (index.file == BRW_IMMEDIATE_VALUE) { 1105 index_4.ud += second_vertex_offset; 1106 brw_MOV(p, m1_4, index_4); 1107 } else { 1108 brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset)); 1109 } 1110 1111 brw_pop_insn_state(p); 1112 } 1113 1114 static void 1115 generate_unpack_flags(struct brw_codegen *p, 1116 struct brw_reg dst) 1117 { 1118 brw_push_insn_state(p); 1119 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1120 brw_set_default_access_mode(p, BRW_ALIGN_1); 1121 1122 struct brw_reg flags = brw_flag_reg(0, 0); 1123 struct brw_reg dst_0 = suboffset(vec1(dst), 0); 1124 struct brw_reg dst_4 = suboffset(vec1(dst), 4); 1125 1126 brw_AND(p, dst_0, flags, brw_imm_ud(0x0f)); 1127 brw_AND(p, dst_4, flags, brw_imm_ud(0xf0)); 1128 brw_SHR(p, dst_4, dst_4, brw_imm_ud(4)); 1129 1130 brw_pop_insn_state(p); 1131 } 1132 1133 static void 1134 generate_scratch_read(struct brw_codegen *p, 1135 vec4_instruction *inst, 1136 struct brw_reg dst, 1137 struct brw_reg index) 1138 { 1139 const struct gen_device_info *devinfo = p->devinfo; 1140 struct brw_reg header = brw_vec8_grf(0, 0); 1141 1142 gen6_resolve_implied_move(p, &header, inst->base_mrf); 1143 1144 generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1), 1145 index); 1146 1147 uint32_t msg_type; 1148 1149 if (devinfo->gen >= 6) 1150 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; 1151 else if (devinfo->gen == 5 || devinfo->is_g4x) 1152 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; 1153 else 1154 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; 1155 1156 const unsigned target_cache = 1157 devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE : 1158 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE : 1159 BRW_DATAPORT_READ_TARGET_RENDER_CACHE; 1160 1161 /* Each of the 8 channel enables is considered for whether each 1162 * dword is written. 
1163 */ 1164 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1165 brw_set_dest(p, send, dst); 1166 brw_set_src0(p, send, header); 1167 if (devinfo->gen < 6) 1168 brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf); 1169 brw_set_dp_read_message(p, send, 1170 brw_scratch_surface_idx(p), 1171 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, 1172 msg_type, target_cache, 1173 2, /* mlen */ 1174 true, /* header_present */ 1175 1 /* rlen */); 1176 } 1177 1178 static void 1179 generate_scratch_write(struct brw_codegen *p, 1180 vec4_instruction *inst, 1181 struct brw_reg dst, 1182 struct brw_reg src, 1183 struct brw_reg index) 1184 { 1185 const struct gen_device_info *devinfo = p->devinfo; 1186 const unsigned target_cache = 1187 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE : 1188 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE : 1189 BRW_DATAPORT_READ_TARGET_RENDER_CACHE); 1190 struct brw_reg header = brw_vec8_grf(0, 0); 1191 bool write_commit; 1192 1193 /* If the instruction is predicated, we'll predicate the send, not 1194 * the header setup. 1195 */ 1196 brw_set_default_predicate_control(p, false); 1197 1198 gen6_resolve_implied_move(p, &header, inst->base_mrf); 1199 1200 generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1), 1201 index); 1202 1203 brw_MOV(p, 1204 retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D), 1205 retype(src, BRW_REGISTER_TYPE_D)); 1206 1207 uint32_t msg_type; 1208 1209 if (devinfo->gen >= 7) 1210 msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE; 1211 else if (devinfo->gen == 6) 1212 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE; 1213 else 1214 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE; 1215 1216 brw_set_default_predicate_control(p, inst->predicate); 1217 1218 /* Pre-gen6, we have to specify write commits to ensure ordering 1219 * between reads and writes within a thread. Afterwards, that's 1220 * guaranteed and write commits only matter for inter-thread 1221 * synchronization. 1222 */ 1223 if (devinfo->gen >= 6) { 1224 write_commit = false; 1225 } else { 1226 /* The visitor set up our destination register to be g0. This 1227 * means that when the next read comes along, we will end up 1228 * reading from g0 and causing a block on the write commit. For 1229 * write-after-read, we are relying on the value of the previous 1230 * read being used (and thus blocking on completion) before our 1231 * write is executed. This means we have to be careful in 1232 * instruction scheduling to not violate this assumption. 1233 */ 1234 write_commit = true; 1235 } 1236 1237 /* Each of the 8 channel enables is considered for whether each 1238 * dword is written. 
1239 */ 1240 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1241 brw_set_dest(p, send, dst); 1242 brw_set_src0(p, send, header); 1243 if (devinfo->gen < 6) 1244 brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf); 1245 brw_set_dp_write_message(p, send, 1246 brw_scratch_surface_idx(p), 1247 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, 1248 msg_type, 1249 target_cache, 1250 3, /* mlen */ 1251 true, /* header present */ 1252 false, /* not a render target write */ 1253 write_commit, /* rlen */ 1254 false, /* eot */ 1255 write_commit); 1256 } 1257 1258 static void 1259 generate_pull_constant_load(struct brw_codegen *p, 1260 struct brw_vue_prog_data *prog_data, 1261 vec4_instruction *inst, 1262 struct brw_reg dst, 1263 struct brw_reg index, 1264 struct brw_reg offset) 1265 { 1266 const struct gen_device_info *devinfo = p->devinfo; 1267 const unsigned target_cache = 1268 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE : 1269 BRW_DATAPORT_READ_TARGET_DATA_CACHE); 1270 assert(index.file == BRW_IMMEDIATE_VALUE && 1271 index.type == BRW_REGISTER_TYPE_UD); 1272 uint32_t surf_index = index.ud; 1273 1274 struct brw_reg header = brw_vec8_grf(0, 0); 1275 1276 gen6_resolve_implied_move(p, &header, inst->base_mrf); 1277 1278 if (devinfo->gen >= 6) { 1279 if (offset.file == BRW_IMMEDIATE_VALUE) { 1280 brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), 1281 BRW_REGISTER_TYPE_D), 1282 brw_imm_d(offset.ud >> 4)); 1283 } else { 1284 brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1), 1285 BRW_REGISTER_TYPE_D), 1286 offset, brw_imm_d(4)); 1287 } 1288 } else { 1289 brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), 1290 BRW_REGISTER_TYPE_D), 1291 offset); 1292 } 1293 1294 uint32_t msg_type; 1295 1296 if (devinfo->gen >= 6) 1297 msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; 1298 else if (devinfo->gen == 5 || devinfo->is_g4x) 1299 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; 1300 else 1301 msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; 1302 1303 /* Each of the 8 channel enables is considered for whether each 1304 * dword is written. 
1305 */ 1306 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1307 brw_set_dest(p, send, dst); 1308 brw_set_src0(p, send, header); 1309 if (devinfo->gen < 6) 1310 brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf); 1311 brw_set_dp_read_message(p, send, 1312 surf_index, 1313 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, 1314 msg_type, 1315 target_cache, 1316 2, /* mlen */ 1317 true, /* header_present */ 1318 1 /* rlen */); 1319 } 1320 1321 static void 1322 generate_get_buffer_size(struct brw_codegen *p, 1323 struct brw_vue_prog_data *prog_data, 1324 vec4_instruction *inst, 1325 struct brw_reg dst, 1326 struct brw_reg src, 1327 struct brw_reg surf_index) 1328 { 1329 assert(p->devinfo->gen >= 7); 1330 assert(surf_index.type == BRW_REGISTER_TYPE_UD && 1331 surf_index.file == BRW_IMMEDIATE_VALUE); 1332 1333 brw_SAMPLE(p, 1334 dst, 1335 inst->base_mrf, 1336 src, 1337 surf_index.ud, 1338 0, 1339 GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO, 1340 1, /* response length */ 1341 inst->mlen, 1342 inst->header_size > 0, 1343 BRW_SAMPLER_SIMD_MODE_SIMD4X2, 1344 BRW_SAMPLER_RETURN_FORMAT_SINT32); 1345 1346 brw_mark_surface_used(&prog_data->base, surf_index.ud); 1347 } 1348 1349 static void 1350 generate_pull_constant_load_gen7(struct brw_codegen *p, 1351 struct brw_vue_prog_data *prog_data, 1352 vec4_instruction *inst, 1353 struct brw_reg dst, 1354 struct brw_reg surf_index, 1355 struct brw_reg offset) 1356 { 1357 assert(surf_index.type == BRW_REGISTER_TYPE_UD); 1358 1359 if (surf_index.file == BRW_IMMEDIATE_VALUE) { 1360 1361 brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND); 1362 brw_set_dest(p, insn, dst); 1363 brw_set_src0(p, insn, offset); 1364 brw_set_sampler_message(p, insn, 1365 surf_index.ud, 1366 0, /* LD message ignores sampler unit */ 1367 GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1368 1, /* rlen */ 1369 inst->mlen, 1370 inst->header_size != 0, 1371 BRW_SAMPLER_SIMD_MODE_SIMD4X2, 1372 0); 1373 1374 brw_mark_surface_used(&prog_data->base, surf_index.ud); 1375 1376 } else { 1377 1378 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 1379 1380 brw_push_insn_state(p); 1381 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1382 brw_set_default_access_mode(p, BRW_ALIGN_1); 1383 1384 /* a0.0 = surf_index & 0xff */ 1385 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); 1386 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); 1387 brw_set_dest(p, insn_and, addr); 1388 brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD))); 1389 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); 1390 1391 brw_pop_insn_state(p); 1392 1393 /* dst = send(offset, a0.0 | <descriptor>) */ 1394 brw_inst *insn = brw_send_indirect_message( 1395 p, BRW_SFID_SAMPLER, dst, offset, addr); 1396 brw_set_sampler_message(p, insn, 1397 0 /* surface */, 1398 0 /* sampler */, 1399 GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1400 1 /* rlen */, 1401 inst->mlen, 1402 inst->header_size != 0, 1403 BRW_SAMPLER_SIMD_MODE_SIMD4X2, 1404 0); 1405 } 1406 } 1407 1408 static void 1409 generate_set_simd4x2_header_gen9(struct brw_codegen *p, 1410 vec4_instruction *inst, 1411 struct brw_reg dst) 1412 { 1413 brw_push_insn_state(p); 1414 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1415 1416 brw_set_default_exec_size(p, BRW_EXECUTE_8); 1417 brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 1418 1419 brw_set_default_access_mode(p, BRW_ALIGN_1); 1420 brw_MOV(p, get_element_ud(dst, 2), 1421 brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2)); 1422 1423 brw_pop_insn_state(p); 1424 } 1425 1426 
static void 1427 generate_mov_indirect(struct brw_codegen *p, 1428 vec4_instruction *inst, 1429 struct brw_reg dst, struct brw_reg reg, 1430 struct brw_reg indirect, struct brw_reg length) 1431 { 1432 assert(indirect.type == BRW_REGISTER_TYPE_UD); 1433 assert(p->devinfo->gen >= 6); 1434 1435 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2); 1436 1437 /* This instruction acts in align1 mode */ 1438 assert(dst.writemask == WRITEMASK_XYZW); 1439 1440 if (indirect.file == BRW_IMMEDIATE_VALUE) { 1441 imm_byte_offset += indirect.ud; 1442 1443 reg.nr = imm_byte_offset / REG_SIZE; 1444 reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2; 1445 unsigned shift = (imm_byte_offset / 4) % 4; 1446 reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift); 1447 1448 brw_MOV(p, dst, reg); 1449 } else { 1450 brw_push_insn_state(p); 1451 brw_set_default_access_mode(p, BRW_ALIGN_1); 1452 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1453 1454 struct brw_reg addr = vec8(brw_address_reg(0)); 1455 1456 /* We need to move the indirect value into the address register. In 1457 * order to make things make some sense, we want to respect at least the 1458 * X component of the swizzle. In order to do that, we need to convert 1459 * the subnr (probably 0) to an align1 subnr and add in the swizzle. 1460 */ 1461 assert(brw_is_single_value_swizzle(indirect.swizzle)); 1462 indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0)); 1463 1464 /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of 1465 * the indirect and splat it out to all four channels of the given half 1466 * of a0. 1467 */ 1468 indirect.subnr *= 2; 1469 indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0); 1470 brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset)); 1471 1472 /* Now we need to incorporate the swizzle from the source register */ 1473 if (reg.swizzle != BRW_SWIZZLE_XXXX) { 1474 uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 | 1475 BRW_GET_SWZ(reg.swizzle, 1) << 6 | 1476 BRW_GET_SWZ(reg.swizzle, 2) << 10 | 1477 BRW_GET_SWZ(reg.swizzle, 3) << 14; 1478 uv_swiz |= uv_swiz << 16; 1479 1480 brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz)); 1481 } 1482 1483 brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type)); 1484 1485 brw_pop_insn_state(p); 1486 } 1487 } 1488 1489 static void 1490 generate_code(struct brw_codegen *p, 1491 const struct brw_compiler *compiler, 1492 void *log_data, 1493 const nir_shader *nir, 1494 struct brw_vue_prog_data *prog_data, 1495 const struct cfg_t *cfg) 1496 { 1497 const struct gen_device_info *devinfo = p->devinfo; 1498 const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage); 1499 bool debug_flag = INTEL_DEBUG & 1500 intel_debug_flag_for_shader_stage(nir->stage); 1501 struct annotation_info annotation; 1502 memset(&annotation, 0, sizeof(annotation)); 1503 int spill_count = 0, fill_count = 0; 1504 int loop_count = 0; 1505 1506 foreach_block_and_inst (block, vec4_instruction, inst, cfg) { 1507 struct brw_reg src[3], dst; 1508 1509 if (unlikely(debug_flag)) 1510 annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset); 1511 1512 for (unsigned int i = 0; i < 3; i++) { 1513 src[i] = inst->src[i].as_brw_reg(); 1514 } 1515 dst = inst->dst.as_brw_reg(); 1516 1517 brw_set_default_predicate_control(p, inst->predicate); 1518 brw_set_default_predicate_inverse(p, inst->predicate_inverse); 1519 brw_set_default_flag_reg(p, 0, inst->flag_subreg); 1520 brw_set_default_saturate(p, inst->saturate); 1521 brw_set_default_mask_control(p, 
inst->force_writemask_all); 1522 brw_set_default_acc_write_control(p, inst->writes_accumulator); 1523 brw_set_default_exec_size(p, cvt(inst->exec_size) - 1); 1524 1525 assert(inst->group % inst->exec_size == 0); 1526 assert(inst->group % 8 == 0 || 1527 inst->dst.type == BRW_REGISTER_TYPE_DF || 1528 inst->src[0].type == BRW_REGISTER_TYPE_DF || 1529 inst->src[1].type == BRW_REGISTER_TYPE_DF || 1530 inst->src[2].type == BRW_REGISTER_TYPE_DF); 1531 if (!inst->force_writemask_all) 1532 brw_set_default_group(p, inst->group); 1533 1534 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen)); 1535 assert(inst->mlen <= BRW_MAX_MSG_LENGTH); 1536 1537 unsigned pre_emit_nr_insn = p->nr_insn; 1538 1539 switch (inst->opcode) { 1540 case VEC4_OPCODE_UNPACK_UNIFORM: 1541 case BRW_OPCODE_MOV: 1542 brw_MOV(p, dst, src[0]); 1543 break; 1544 case BRW_OPCODE_ADD: 1545 brw_ADD(p, dst, src[0], src[1]); 1546 break; 1547 case BRW_OPCODE_MUL: 1548 brw_MUL(p, dst, src[0], src[1]); 1549 break; 1550 case BRW_OPCODE_MACH: 1551 brw_MACH(p, dst, src[0], src[1]); 1552 break; 1553 1554 case BRW_OPCODE_MAD: 1555 assert(devinfo->gen >= 6); 1556 brw_MAD(p, dst, src[0], src[1], src[2]); 1557 break; 1558 1559 case BRW_OPCODE_FRC: 1560 brw_FRC(p, dst, src[0]); 1561 break; 1562 case BRW_OPCODE_RNDD: 1563 brw_RNDD(p, dst, src[0]); 1564 break; 1565 case BRW_OPCODE_RNDE: 1566 brw_RNDE(p, dst, src[0]); 1567 break; 1568 case BRW_OPCODE_RNDZ: 1569 brw_RNDZ(p, dst, src[0]); 1570 break; 1571 1572 case BRW_OPCODE_AND: 1573 brw_AND(p, dst, src[0], src[1]); 1574 break; 1575 case BRW_OPCODE_OR: 1576 brw_OR(p, dst, src[0], src[1]); 1577 break; 1578 case BRW_OPCODE_XOR: 1579 brw_XOR(p, dst, src[0], src[1]); 1580 break; 1581 case BRW_OPCODE_NOT: 1582 brw_NOT(p, dst, src[0]); 1583 break; 1584 case BRW_OPCODE_ASR: 1585 brw_ASR(p, dst, src[0], src[1]); 1586 break; 1587 case BRW_OPCODE_SHR: 1588 brw_SHR(p, dst, src[0], src[1]); 1589 break; 1590 case BRW_OPCODE_SHL: 1591 brw_SHL(p, dst, src[0], src[1]); 1592 break; 1593 1594 case BRW_OPCODE_CMP: 1595 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 1596 break; 1597 case BRW_OPCODE_SEL: 1598 brw_SEL(p, dst, src[0], src[1]); 1599 break; 1600 1601 case BRW_OPCODE_DPH: 1602 brw_DPH(p, dst, src[0], src[1]); 1603 break; 1604 1605 case BRW_OPCODE_DP4: 1606 brw_DP4(p, dst, src[0], src[1]); 1607 break; 1608 1609 case BRW_OPCODE_DP3: 1610 brw_DP3(p, dst, src[0], src[1]); 1611 break; 1612 1613 case BRW_OPCODE_DP2: 1614 brw_DP2(p, dst, src[0], src[1]); 1615 break; 1616 1617 case BRW_OPCODE_F32TO16: 1618 assert(devinfo->gen >= 7); 1619 brw_F32TO16(p, dst, src[0]); 1620 break; 1621 1622 case BRW_OPCODE_F16TO32: 1623 assert(devinfo->gen >= 7); 1624 brw_F16TO32(p, dst, src[0]); 1625 break; 1626 1627 case BRW_OPCODE_LRP: 1628 assert(devinfo->gen >= 6); 1629 brw_LRP(p, dst, src[0], src[1], src[2]); 1630 break; 1631 1632 case BRW_OPCODE_BFREV: 1633 assert(devinfo->gen >= 7); 1634 /* BFREV only supports UD type for src and dst. */ 1635 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), 1636 retype(src[0], BRW_REGISTER_TYPE_UD)); 1637 break; 1638 case BRW_OPCODE_FBH: 1639 assert(devinfo->gen >= 7); 1640 /* FBH only supports UD type for dst. */ 1641 brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); 1642 break; 1643 case BRW_OPCODE_FBL: 1644 assert(devinfo->gen >= 7); 1645 /* FBL only supports UD type for dst. 
*/ 1646 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); 1647 break; 1648 case BRW_OPCODE_LZD: 1649 brw_LZD(p, dst, src[0]); 1650 break; 1651 case BRW_OPCODE_CBIT: 1652 assert(devinfo->gen >= 7); 1653 /* CBIT only supports UD type for dst. */ 1654 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); 1655 break; 1656 case BRW_OPCODE_ADDC: 1657 assert(devinfo->gen >= 7); 1658 brw_ADDC(p, dst, src[0], src[1]); 1659 break; 1660 case BRW_OPCODE_SUBB: 1661 assert(devinfo->gen >= 7); 1662 brw_SUBB(p, dst, src[0], src[1]); 1663 break; 1664 case BRW_OPCODE_MAC: 1665 brw_MAC(p, dst, src[0], src[1]); 1666 break; 1667 1668 case BRW_OPCODE_BFE: 1669 assert(devinfo->gen >= 7); 1670 brw_BFE(p, dst, src[0], src[1], src[2]); 1671 break; 1672 1673 case BRW_OPCODE_BFI1: 1674 assert(devinfo->gen >= 7); 1675 brw_BFI1(p, dst, src[0], src[1]); 1676 break; 1677 case BRW_OPCODE_BFI2: 1678 assert(devinfo->gen >= 7); 1679 brw_BFI2(p, dst, src[0], src[1], src[2]); 1680 break; 1681 1682 case BRW_OPCODE_IF: 1683 if (!inst->src[0].is_null()) { 1684 /* The instruction has an embedded compare (only allowed on gen6) */ 1685 assert(devinfo->gen == 6); 1686 gen6_IF(p, inst->conditional_mod, src[0], src[1]); 1687 } else { 1688 brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8); 1689 brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate); 1690 } 1691 break; 1692 1693 case BRW_OPCODE_ELSE: 1694 brw_ELSE(p); 1695 break; 1696 case BRW_OPCODE_ENDIF: 1697 brw_ENDIF(p); 1698 break; 1699 1700 case BRW_OPCODE_DO: 1701 brw_DO(p, BRW_EXECUTE_8); 1702 break; 1703 1704 case BRW_OPCODE_BREAK: 1705 brw_BREAK(p); 1706 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 1707 break; 1708 case BRW_OPCODE_CONTINUE: 1709 brw_CONT(p); 1710 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 1711 break; 1712 1713 case BRW_OPCODE_WHILE: 1714 brw_WHILE(p); 1715 loop_count++; 1716 break; 1717 1718 case SHADER_OPCODE_RCP: 1719 case SHADER_OPCODE_RSQ: 1720 case SHADER_OPCODE_SQRT: 1721 case SHADER_OPCODE_EXP2: 1722 case SHADER_OPCODE_LOG2: 1723 case SHADER_OPCODE_SIN: 1724 case SHADER_OPCODE_COS: 1725 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 1726 if (devinfo->gen >= 7) { 1727 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], 1728 brw_null_reg()); 1729 } else if (devinfo->gen == 6) { 1730 generate_math_gen6(p, inst, dst, src[0], brw_null_reg()); 1731 } else { 1732 generate_math1_gen4(p, inst, dst, src[0]); 1733 } 1734 break; 1735 1736 case SHADER_OPCODE_POW: 1737 case SHADER_OPCODE_INT_QUOTIENT: 1738 case SHADER_OPCODE_INT_REMAINDER: 1739 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 1740 if (devinfo->gen >= 7) { 1741 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); 1742 } else if (devinfo->gen == 6) { 1743 generate_math_gen6(p, inst, dst, src[0], src[1]); 1744 } else { 1745 generate_math2_gen4(p, inst, dst, src[0], src[1]); 1746 } 1747 break; 1748 1749 case SHADER_OPCODE_TEX: 1750 case SHADER_OPCODE_TXD: 1751 case SHADER_OPCODE_TXF: 1752 case SHADER_OPCODE_TXF_CMS: 1753 case SHADER_OPCODE_TXF_CMS_W: 1754 case SHADER_OPCODE_TXF_MCS: 1755 case SHADER_OPCODE_TXL: 1756 case SHADER_OPCODE_TXS: 1757 case SHADER_OPCODE_TG4: 1758 case SHADER_OPCODE_TG4_OFFSET: 1759 case SHADER_OPCODE_SAMPLEINFO: 1760 generate_tex(p, prog_data, nir->stage, 1761 inst, dst, src[0], src[1], src[2]); 1762 break; 1763 1764 case VS_OPCODE_URB_WRITE: 1765 generate_vs_urb_write(p, inst); 1766 break; 1767 1768 case SHADER_OPCODE_GEN4_SCRATCH_READ: 1769 generate_scratch_read(p, inst, dst, src[0]); 1770 
      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(p, inst, dst, src[0]);
         fill_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(p, inst, dst, src[0], src[1]);
         spill_count++;
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD:
         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
         generate_set_simd4x2_header_gen9(p, inst, dst);
         break;

      case VS_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_URB_WRITE:
         generate_gs_urb_write(p, inst);
         break;

      case GS_OPCODE_URB_WRITE_ALLOCATE:
         generate_gs_urb_write_allocate(p, inst);
         break;

      case GS_OPCODE_SVB_WRITE:
         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_SVB_SET_DST_INDEX:
         generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
         break;

      case GS_OPCODE_THREAD_END:
         generate_gs_thread_end(p, inst);
         break;

      case GS_OPCODE_SET_WRITE_OFFSET:
         generate_gs_set_write_offset(p, dst, src[0], src[1]);
         break;

      case GS_OPCODE_SET_VERTEX_COUNT:
         generate_gs_set_vertex_count(p, dst, src[0]);
         break;

      case GS_OPCODE_FF_SYNC:
         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
         break;

      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
         generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
         break;

      case GS_OPCODE_SET_PRIMITIVE_ID:
         generate_gs_set_primitive_id(p, dst);
         break;

      case GS_OPCODE_SET_DWORD_2:
         generate_gs_set_dword_2(p, dst, src[0]);
         break;

      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
         generate_gs_prepare_channel_masks(p, dst);
         break;

      case GS_OPCODE_SET_CHANNEL_MASKS:
         generate_gs_set_channel_masks(p, dst, src[0]);
         break;

      case GS_OPCODE_GET_INSTANCE_ID:
         generate_gs_get_instance_id(p, dst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         brw_shader_time_add(p, src[0],
                             prog_data->base.binding_table.shader_time_start);
         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.shader_time_start);
         break;

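      /* Untyped/typed surface and atomic messages.  For all of these src[2]
       * must be an immediate (see the asserts below); its value is passed
       * straight through to the brw_untyped_*()/brw_typed_*() helpers.
       */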
      case SHADER_OPCODE_UNTYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                            !inst->dst.is_null());
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
                                  src[2].ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
                                   src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                          !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
                                src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen,
                                 src[2].ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, nir->stage,
                                          &prog_data->base) ? brw_imm_ud(~0u) :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
         generate_unpack_flags(p, dst);
         break;

      case VEC4_OPCODE_MOV_BYTES: {
         /* Moves the low byte from each channel, using an Align1 access mode
          * and a <4,1,0> source region.
          */
         assert(src[0].type == BRW_REGISTER_TYPE_UB ||
                src[0].type == BRW_REGISTER_TYPE_B);

         brw_set_default_access_mode(p, BRW_ALIGN_1);
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_1;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
         brw_MOV(p, dst, src[0]);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_FROM_DOUBLE: {
         assert(type_sz(src[0].type) == 8);
         assert(type_sz(dst.type) == 4);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
         dst.width = BRW_WIDTH_4;
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_4;
         brw_MOV(p, dst, src[0]);

         struct brw_reg dst_as_src = dst;
         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
         dst.width = BRW_WIDTH_8;
         brw_MOV(p, dst, dst_as_src);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_TO_DOUBLE: {
         assert(type_sz(src[0].type) == 4);
         assert(type_sz(dst.type) == 8);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         struct brw_reg tmp = retype(dst, src[0].type);
         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
         tmp.width = BRW_WIDTH_4;
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
         src[0].width = BRW_WIDTH_4;
         brw_MOV(p, tmp, src[0]);

         tmp.vstride = BRW_VERTICAL_STRIDE_8;
         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
         tmp.width = BRW_WIDTH_4;
         brw_MOV(p, dst, tmp);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_PICK_LOW_32BIT:
      case VEC4_OPCODE_PICK_HIGH_32BIT: {
         /* Stores the low/high 32-bit of each 64-bit element in src[0] into
          * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
          */
         assert(type_sz(src[0].type) == 8);
         assert(type_sz(dst.type) == 4);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst = retype(dst, BRW_REGISTER_TYPE_UD);
         dst.hstride = BRW_HORIZONTAL_STRIDE_1;

         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
         if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
            src[0] = suboffset(src[0], 1);
         src[0].vstride = BRW_VERTICAL_STRIDE_8;
         src[0].width = BRW_WIDTH_4;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_2;
         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

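      /* SET_{LOW,HIGH}_32BIT is the write-side counterpart of
       * PICK_{LOW,HIGH}_32BIT above: packed 32-bit data is scattered into one
       * half of each 64-bit channel of dst, again with Align1 strided regions.
       */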
      case VEC4_OPCODE_SET_LOW_32BIT:
      case VEC4_OPCODE_SET_HIGH_32BIT: {
         /* Reads consecutive 32-bit elements from src[0] and writes
          * them to the low/high 32-bit of each 64-bit element in dst.
          */
         assert(type_sz(src[0].type) == 4);
         assert(type_sz(dst.type) == 8);

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         dst = retype(dst, BRW_REGISTER_TYPE_UD);
         if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
            dst = suboffset(dst, 1);
         dst.hstride = BRW_HORIZONTAL_STRIDE_2;

         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_4;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_MOV(p, dst, src[0]);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case VEC4_OPCODE_PACK_BYTES: {
         /* Is effectively:
          *
          *   mov(8) dst<16,4,1>:UB src<4,1,0>:UB
          *
          * but destinations' only regioning is horizontal stride, so instead we
          * have to use two instructions:
          *
          *   mov(4) dst<1>:UB     src<4,1,0>:UB
          *   mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
          *
          * where they pack the four bytes from the low and high four DW.
          */
         assert(_mesa_is_pow_two(dst.writemask) &&
                dst.writemask != 0);
         unsigned offset = __builtin_ctz(dst.writemask);

         dst.type = BRW_REGISTER_TYPE_UB;

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         src[0].type = BRW_REGISTER_TYPE_UB;
         src[0].vstride = BRW_VERTICAL_STRIDE_4;
         src[0].width = BRW_WIDTH_1;
         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
         dst.subnr = offset * 4;
         struct brw_inst *insn = brw_MOV(p, dst, src[0]);
         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
         brw_inst_set_no_dd_clear(p->devinfo, insn, true);
         brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);

         src[0].subnr = 16;
         dst.subnr = 16 + offset * 4;
         insn = brw_MOV(p, dst, src[0]);
         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
         brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, insn, true);

         brw_set_default_access_mode(p, BRW_ALIGN_16);
         break;
      }

      case TCS_OPCODE_URB_WRITE:
         generate_tcs_urb_write(p, inst, src[0]);
         break;

      case VEC4_OPCODE_URB_READ:
         generate_vec4_urb_read(p, inst, dst, src[0]);
         break;

      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_GET_INSTANCE_ID:
         generate_tcs_get_instance_id(p, dst);
         break;

      case TCS_OPCODE_GET_PRIMITIVE_ID:
         generate_tcs_get_primitive_id(p, dst);
         break;

      case TCS_OPCODE_CREATE_BARRIER_HEADER:
         generate_tcs_create_barrier_header(p, prog_data, dst);
         break;

      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
         generate_tes_create_input_read_header(p, dst);
         break;

      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
         generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
         break;

      case TES_OPCODE_GET_PRIMITIVE_ID:
         generate_tes_get_primitive_id(p, dst);
         break;

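      /* TCS_OPCODE_SRC0_010_IS_ZERO tests whether a <0,1,0> region of src[0]
       * is zero by MOVing it to the null register; any conditional mod on the
       * IR instruction is applied to that MOV by the fix-up after this switch,
       * leaving the result in the flag register.
       */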
      case TCS_OPCODE_SRC0_010_IS_ZERO:
         /* If src_reg had stride like fs_reg, we wouldn't need this. */
         brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
         break;

      case TCS_OPCODE_RELEASE_INPUT:
         generate_tcs_release_input(p, dst, src[0], src[1]);
         break;

      case TCS_OPCODE_THREAD_END:
         generate_tcs_thread_end(p, inst);
         break;

      case SHADER_OPCODE_BARRIER:
         brw_barrier(p, src[0]);
         brw_WAIT(p);
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_DIM:
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      default:
         unreachable("Unsupported opcode");
      }

      if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
         /* Handled dependency hints in the generator. */

         assert(!inst->conditional_mod);
      } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         brw_inst *last = &p->store[pre_emit_nr_insn];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

   brw_set_uip_jip(p, 0);
   annotation_finalize(&annotation, p->next_insn_offset);

#ifndef NDEBUG
   bool validated = brw_validate_instructions(p, 0, &annotation);
#else
   if (unlikely(debug_flag))
      brw_validate_instructions(p, 0, &annotation);
#endif

   int before_size = p->next_insn_offset;
   brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann);
   int after_size = p->next_insn_offset;

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s %s shader %s:\n",
              nir->info->label ? nir->info->label : "unnamed",
              _mesa_shader_stage_to_string(nir->stage), nir->info->name);

      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
              "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, annotation.ann_count, annotation.ann,
                    p->devinfo);
      ralloc_free(annotation.mem_ctx);
   }
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, compacted %d to %d bytes.",
                              stage_abbrev, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, before_size, after_size);
}

extern "C" const unsigned *
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                           void *log_data,
                           void *mem_ctx,
                           const nir_shader *nir,
                           struct brw_vue_prog_data *prog_data,
                           const struct cfg_t *cfg,
                           unsigned *out_assembly_size)
{
   struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(compiler->devinfo, p, mem_ctx);
   brw_set_default_access_mode(p, BRW_ALIGN_16);

   generate_code(p, compiler, log_data, nir, prog_data, cfg);

   return brw_get_program(p, out_assembly_size);
}