1 /* 2 * Copyright 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24 /** @file brw_fs_generator.cpp 25 * 26 * This file supports generating code from the FS LIR to the actual 27 * native instructions. 28 */ 29 30 #include "brw_eu.h" 31 #include "brw_fs.h" 32 #include "brw_cfg.h" 33 #include "brw_program.h" 34 35 static enum brw_reg_file 36 brw_file_from_reg(fs_reg *reg) 37 { 38 switch (reg->file) { 39 case ARF: 40 return BRW_ARCHITECTURE_REGISTER_FILE; 41 case FIXED_GRF: 42 case VGRF: 43 return BRW_GENERAL_REGISTER_FILE; 44 case MRF: 45 return BRW_MESSAGE_REGISTER_FILE; 46 case IMM: 47 return BRW_IMMEDIATE_VALUE; 48 case BAD_FILE: 49 case ATTR: 50 case UNIFORM: 51 unreachable("not reached"); 52 } 53 return BRW_ARCHITECTURE_REGISTER_FILE; 54 } 55 56 static struct brw_reg 57 brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed) 58 { 59 struct brw_reg brw_reg; 60 61 switch (reg->file) { 62 case MRF: 63 assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen)); 64 /* Fallthrough */ 65 case VGRF: 66 if (reg->stride == 0) { 67 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); 68 } else { 69 /* From the Haswell PRM: 70 * 71 * "VertStride must be used to cross GRF register boundaries. This 72 * rule implies that elements within a 'Width' cannot cross GRF 73 * boundaries." 74 * 75 * The maximum width value that could satisfy this restriction is: 76 */ 77 const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type)); 78 79 /* Because the hardware can only split source regions at a whole 80 * multiple of width during decompression (i.e. vertically), clamp 81 * the value obtained above to the physical execution size of a 82 * single decompressed chunk of the instruction: 83 */ 84 const unsigned phys_width = compressed ? inst->exec_size / 2 : 85 inst->exec_size; 86 87 /* XXX - The equation above is strictly speaking not correct on 88 * hardware that supports unbalanced GRF writes -- On Gen9+ 89 * each decompressed chunk of the instruction may have a 90 * different execution size when the number of components 91 * written to each destination GRF is not the same. 92 */ 93 const unsigned width = MIN2(reg_width, phys_width); 94 brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0); 95 brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride); 96 } 97 98 brw_reg = retype(brw_reg, reg->type); 99 brw_reg = byte_offset(brw_reg, reg->offset); 100 brw_reg.abs = reg->abs; 101 brw_reg.negate = reg->negate; 102 break; 103 case ARF: 104 case FIXED_GRF: 105 case IMM: 106 assert(reg->offset == 0); 107 brw_reg = reg->as_brw_reg(); 108 break; 109 case BAD_FILE: 110 /* Probably unused. */ 111 brw_reg = brw_null_reg(); 112 break; 113 case ATTR: 114 case UNIFORM: 115 unreachable("not reached"); 116 } 117 118 return brw_reg; 119 } 120 121 fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data, 122 void *mem_ctx, 123 const void *key, 124 struct brw_stage_prog_data *prog_data, 125 unsigned promoted_constants, 126 bool runtime_check_aads_emit, 127 gl_shader_stage stage) 128 129 : compiler(compiler), log_data(log_data), 130 devinfo(compiler->devinfo), key(key), 131 prog_data(prog_data), 132 promoted_constants(promoted_constants), 133 runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false), 134 stage(stage), mem_ctx(mem_ctx) 135 { 136 p = rzalloc(mem_ctx, struct brw_codegen); 137 brw_init_codegen(devinfo, p, mem_ctx); 138 } 139 140 fs_generator::~fs_generator() 141 { 142 } 143 144 class ip_record : public exec_node { 145 public: 146 DECLARE_RALLOC_CXX_OPERATORS(ip_record) 147 148 ip_record(int ip) 149 { 150 this->ip = ip; 151 } 152 153 int ip; 154 }; 155 156 bool 157 fs_generator::patch_discard_jumps_to_fb_writes() 158 { 159 if (devinfo->gen < 6 || this->discard_halt_patches.is_empty()) 160 return false; 161 162 int scale = brw_jump_scale(p->devinfo); 163 164 /* There is a somewhat strange undocumented requirement of using 165 * HALT, according to the simulator. If some channel has HALTed to 166 * a particular UIP, then by the end of the program, every channel 167 * must have HALTed to that UIP. Furthermore, the tracking is a 168 * stack, so you can't do the final halt of a UIP after starting 169 * halting to a new UIP. 170 * 171 * Symptoms of not emitting this instruction on actual hardware 172 * included GPU hangs and sparkly rendering on the piglit discard 173 * tests. 174 */ 175 brw_inst *last_halt = gen6_HALT(p); 176 brw_inst_set_uip(p->devinfo, last_halt, 1 * scale); 177 brw_inst_set_jip(p->devinfo, last_halt, 1 * scale); 178 179 int ip = p->nr_insn; 180 181 foreach_in_list(ip_record, patch_ip, &discard_halt_patches) { 182 brw_inst *patch = &p->store[patch_ip->ip]; 183 184 assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT); 185 /* HALT takes a half-instruction distance from the pre-incremented IP. */ 186 brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale); 187 } 188 189 this->discard_halt_patches.make_empty(); 190 return true; 191 } 192 193 void 194 fs_generator::fire_fb_write(fs_inst *inst, 195 struct brw_reg payload, 196 struct brw_reg implied_header, 197 GLuint nr) 198 { 199 uint32_t msg_control; 200 201 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 202 203 if (devinfo->gen < 6) { 204 brw_push_insn_state(p); 205 brw_set_default_exec_size(p, BRW_EXECUTE_8); 206 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 207 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 208 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 209 brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0)); 210 brw_pop_insn_state(p); 211 } 212 213 if (inst->opcode == FS_OPCODE_REP_FB_WRITE) 214 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED; 215 else if (prog_data->dual_src_blend) { 216 if (!inst->group) 217 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01; 218 else 219 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23; 220 } else if (inst->exec_size == 16) 221 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; 222 else 223 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01; 224 225 uint32_t surf_index = 226 prog_data->binding_table.render_target_start + inst->target; 227 228 bool last_render_target = inst->eot || 229 (prog_data->dual_src_blend && dispatch_width == 16); 230 231 232 brw_fb_WRITE(p, 233 payload, 234 implied_header, 235 msg_control, 236 surf_index, 237 nr, 238 0, 239 inst->eot, 240 last_render_target, 241 inst->header_size != 0); 242 243 brw_mark_surface_used(&prog_data->base, surf_index); 244 } 245 246 void 247 fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload) 248 { 249 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 250 const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key; 251 struct brw_reg implied_header; 252 253 if (devinfo->gen < 8 && !devinfo->is_haswell) { 254 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 255 } 256 257 if (inst->base_mrf >= 0) 258 payload = brw_message_reg(inst->base_mrf); 259 260 /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied 261 * move, here's g1. 262 */ 263 if (inst->header_size != 0) { 264 brw_push_insn_state(p); 265 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 266 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 267 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 268 brw_set_default_flag_reg(p, 0, 0); 269 270 /* On HSW, the GPU will use the predicate on SENDC, unless the header is 271 * present. 272 */ 273 if (prog_data->uses_kill) { 274 struct brw_reg pixel_mask; 275 276 if (devinfo->gen >= 6) 277 pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 278 else 279 pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 280 281 brw_MOV(p, pixel_mask, brw_flag_reg(0, 1)); 282 } 283 284 if (devinfo->gen >= 6) { 285 brw_push_insn_state(p); 286 brw_set_default_exec_size(p, BRW_EXECUTE_16); 287 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); 288 brw_MOV(p, 289 retype(payload, BRW_REGISTER_TYPE_UD), 290 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 291 brw_pop_insn_state(p); 292 293 if (inst->target > 0 && key->replicate_alpha) { 294 /* Set "Source0 Alpha Present to RenderTarget" bit in message 295 * header. 296 */ 297 brw_OR(p, 298 vec1(retype(payload, BRW_REGISTER_TYPE_UD)), 299 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), 300 brw_imm_ud(0x1 << 11)); 301 } 302 303 if (inst->target > 0) { 304 /* Set the render target index for choosing BLEND_STATE. */ 305 brw_MOV(p, retype(vec1(suboffset(payload, 2)), 306 BRW_REGISTER_TYPE_UD), 307 brw_imm_ud(inst->target)); 308 } 309 310 /* Set computes stencil to render target */ 311 if (prog_data->computed_stencil) { 312 brw_OR(p, 313 vec1(retype(payload, BRW_REGISTER_TYPE_UD)), 314 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)), 315 brw_imm_ud(0x1 << 14)); 316 } 317 318 implied_header = brw_null_reg(); 319 } else { 320 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 321 } 322 323 brw_pop_insn_state(p); 324 } else { 325 implied_header = brw_null_reg(); 326 } 327 328 if (!runtime_check_aads_emit) { 329 fire_fb_write(inst, payload, implied_header, inst->mlen); 330 } else { 331 /* This can only happen in gen < 6 */ 332 assert(devinfo->gen < 6); 333 334 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); 335 336 /* Check runtime bit to detect if we have to send AA data or not */ 337 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 338 brw_AND(p, 339 v1_null_ud, 340 retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD), 341 brw_imm_ud(1<<26)); 342 brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ); 343 344 int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store; 345 brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1); 346 { 347 /* Don't send AA data */ 348 fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1); 349 } 350 brw_land_fwd_jump(p, jmp); 351 fire_fb_write(inst, payload, implied_header, inst->mlen); 352 } 353 } 354 355 void 356 fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst, 357 struct brw_reg payload) 358 { 359 assert(inst->size_written % REG_SIZE == 0); 360 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 361 const unsigned surf_index = 362 prog_data->binding_table.render_target_start + inst->target; 363 364 gen9_fb_READ(p, dst, payload, surf_index, 365 inst->header_size, inst->size_written / REG_SIZE, 366 prog_data->persample_dispatch); 367 368 brw_mark_surface_used(&prog_data->base, surf_index); 369 } 370 371 void 372 fs_generator::generate_mov_indirect(fs_inst *inst, 373 struct brw_reg dst, 374 struct brw_reg reg, 375 struct brw_reg indirect_byte_offset) 376 { 377 assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD); 378 assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE); 379 380 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr; 381 382 if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) { 383 imm_byte_offset += indirect_byte_offset.ud; 384 385 reg.nr = imm_byte_offset / REG_SIZE; 386 reg.subnr = imm_byte_offset % REG_SIZE; 387 brw_MOV(p, dst, reg); 388 } else { 389 /* Prior to Broadwell, there are only 8 address registers. */ 390 assert(inst->exec_size == 8 || devinfo->gen >= 8); 391 392 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */ 393 struct brw_reg addr = vec8(brw_address_reg(0)); 394 395 /* The destination stride of an instruction (in bytes) must be greater 396 * than or equal to the size of the rest of the instruction. Since the 397 * address register is of type UW, we can't use a D-type instruction. 398 * In order to get around this, re retype to UW and use a stride. 399 */ 400 indirect_byte_offset = 401 retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW); 402 403 /* There are a number of reasons why we don't use the base offset here. 404 * One reason is that the field is only 9 bits which means we can only 405 * use it to access the first 16 GRFs. Also, from the Haswell PRM 406 * section "Register Region Restrictions": 407 * 408 * "The lower bits of the AddressImmediate must not overflow to 409 * change the register address. The lower 5 bits of Address 410 * Immediate when added to lower 5 bits of address register gives 411 * the sub-register offset. The upper bits of Address Immediate 412 * when added to upper bits of address register gives the register 413 * address. Any overflow from sub-register offset is dropped." 414 * 415 * Since the indirect may cause us to cross a register boundary, this 416 * makes the base offset almost useless. We could try and do something 417 * clever where we use a actual base offset if base_offset % 32 == 0 but 418 * that would mean we were generating different code depending on the 419 * base offset. Instead, for the sake of consistency, we'll just do the 420 * add ourselves. This restriction is only listed in the Haswell PRM 421 * but empirical testing indicates that it applies on all older 422 * generations and is lifted on Broadwell. 423 * 424 * In the end, while base_offset is nice to look at in the generated 425 * code, using it saves us 0 instructions and would require quite a bit 426 * of case-by-case work. It's just not worth it. 427 */ 428 brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset)); 429 struct brw_reg ind_src = brw_VxH_indirect(0, 0); 430 431 brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type)); 432 433 if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE && 434 !inst->get_next()->is_tail_sentinel() && 435 ((fs_inst *)inst->get_next())->mlen > 0) { 436 /* From the Sandybridge PRM: 437 * 438 * "[Errata: DevSNB(SNB)] If MRF register is updated by any 439 * instruction that indexed/indirect source AND is followed by a 440 * send, the instruction requires a Switch. This is to avoid 441 * race condition where send may dispatch before MRF is updated." 442 */ 443 brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH); 444 } 445 } 446 } 447 448 void 449 fs_generator::generate_urb_read(fs_inst *inst, 450 struct brw_reg dst, 451 struct brw_reg header) 452 { 453 assert(inst->size_written % REG_SIZE == 0); 454 assert(header.file == BRW_GENERAL_REGISTER_FILE); 455 assert(header.type == BRW_REGISTER_TYPE_UD); 456 457 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 458 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); 459 brw_set_src0(p, send, header); 460 brw_set_src1(p, send, brw_imm_ud(0u)); 461 462 brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB); 463 brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ); 464 465 if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT) 466 brw_inst_set_urb_per_slot_offset(p->devinfo, send, true); 467 468 brw_inst_set_mlen(p->devinfo, send, inst->mlen); 469 brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE); 470 brw_inst_set_header_present(p->devinfo, send, true); 471 brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset); 472 } 473 474 void 475 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload) 476 { 477 brw_inst *insn; 478 479 insn = brw_next_insn(p, BRW_OPCODE_SEND); 480 481 brw_set_dest(p, insn, brw_null_reg()); 482 brw_set_src0(p, insn, payload); 483 brw_set_src1(p, insn, brw_imm_d(0)); 484 485 brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB); 486 brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE); 487 488 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || 489 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) 490 brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true); 491 492 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || 493 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) 494 brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true); 495 496 brw_inst_set_mlen(p->devinfo, insn, inst->mlen); 497 brw_inst_set_rlen(p->devinfo, insn, 0); 498 brw_inst_set_eot(p->devinfo, insn, inst->eot); 499 brw_inst_set_header_present(p->devinfo, insn, true); 500 brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset); 501 } 502 503 void 504 fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload) 505 { 506 struct brw_inst *insn; 507 508 insn = brw_next_insn(p, BRW_OPCODE_SEND); 509 510 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW)); 511 brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW)); 512 brw_set_src1(p, insn, brw_imm_d(0)); 513 514 /* Terminate a compute shader by sending a message to the thread spawner. 515 */ 516 brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER); 517 brw_inst_set_mlen(devinfo, insn, 1); 518 brw_inst_set_rlen(devinfo, insn, 0); 519 brw_inst_set_eot(devinfo, insn, inst->eot); 520 brw_inst_set_header_present(devinfo, insn, false); 521 522 brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */ 523 brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */ 524 525 /* Note that even though the thread has a URB resource associated with it, 526 * we set the "do not dereference URB" bit, because the URB resource is 527 * managed by the fixed-function unit, so it will free it automatically. 528 */ 529 brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */ 530 531 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE); 532 } 533 534 void 535 fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src) 536 { 537 brw_barrier(p, src); 538 brw_WAIT(p); 539 } 540 541 void 542 fs_generator::generate_linterp(fs_inst *inst, 543 struct brw_reg dst, struct brw_reg *src) 544 { 545 /* PLN reads: 546 * / in SIMD16 \ 547 * ----------------------------------- 548 * | src1+0 | src1+1 | src1+2 | src1+3 | 549 * |-----------------------------------| 550 * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| 551 * ----------------------------------- 552 * 553 * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: 554 * 555 * ----------------------------------- 556 * | src1+0 | src1+1 | src1+2 | src1+3 | 557 * |-----------------------------------| 558 * |(x0, x1)|(y0, y1)| | | in SIMD8 559 * |-----------------------------------| 560 * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 561 * ----------------------------------- 562 * 563 * See also: emit_interpolation_setup_gen4(). 564 */ 565 struct brw_reg delta_x = src[0]; 566 struct brw_reg delta_y = offset(src[0], inst->exec_size / 8); 567 struct brw_reg interp = src[1]; 568 569 if (devinfo->has_pln && 570 (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) { 571 brw_PLN(p, dst, interp, delta_x); 572 } else { 573 brw_LINE(p, brw_null_reg(), interp, delta_x); 574 brw_MAC(p, dst, suboffset(interp, 1), delta_y); 575 } 576 } 577 578 void 579 fs_generator::generate_get_buffer_size(fs_inst *inst, 580 struct brw_reg dst, 581 struct brw_reg src, 582 struct brw_reg surf_index) 583 { 584 assert(devinfo->gen >= 7); 585 assert(surf_index.file == BRW_IMMEDIATE_VALUE); 586 587 uint32_t simd_mode; 588 int rlen = 4; 589 590 switch (inst->exec_size) { 591 case 8: 592 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 593 break; 594 case 16: 595 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 596 break; 597 default: 598 unreachable("Invalid width for texture instruction"); 599 } 600 601 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 602 rlen = 8; 603 dst = vec16(dst); 604 } 605 606 brw_SAMPLE(p, 607 retype(dst, BRW_REGISTER_TYPE_UW), 608 inst->base_mrf, 609 src, 610 surf_index.ud, 611 0, 612 GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO, 613 rlen, /* response length */ 614 inst->mlen, 615 inst->header_size > 0, 616 simd_mode, 617 BRW_SAMPLER_RETURN_FORMAT_SINT32); 618 619 brw_mark_surface_used(prog_data, surf_index.ud); 620 } 621 622 void 623 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src, 624 struct brw_reg surface_index, 625 struct brw_reg sampler_index) 626 { 627 assert(inst->size_written % REG_SIZE == 0); 628 int msg_type = -1; 629 uint32_t simd_mode; 630 uint32_t return_format; 631 bool is_combined_send = inst->eot; 632 633 switch (dst.type) { 634 case BRW_REGISTER_TYPE_D: 635 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; 636 break; 637 case BRW_REGISTER_TYPE_UD: 638 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 639 break; 640 default: 641 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 642 break; 643 } 644 645 /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type 646 * is set as part of the message descriptor. On gen4, the PRM seems to 647 * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on 648 * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is 649 * gone from the message descriptor entirely and you just get UINT32 all 650 * the time regasrdless. Since we can really only do non-UINT32 on gen4, 651 * just stomp it to UINT32 all the time. 652 */ 653 if (inst->opcode == SHADER_OPCODE_TXS) 654 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 655 656 switch (inst->exec_size) { 657 case 8: 658 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 659 break; 660 case 16: 661 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 662 break; 663 default: 664 unreachable("Invalid width for texture instruction"); 665 } 666 667 if (devinfo->gen >= 5) { 668 switch (inst->opcode) { 669 case SHADER_OPCODE_TEX: 670 if (inst->shadow_compare) { 671 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE; 672 } else { 673 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE; 674 } 675 break; 676 case FS_OPCODE_TXB: 677 if (inst->shadow_compare) { 678 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE; 679 } else { 680 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 681 } 682 break; 683 case SHADER_OPCODE_TXL: 684 if (inst->shadow_compare) { 685 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 686 } else { 687 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 688 } 689 break; 690 case SHADER_OPCODE_TXL_LZ: 691 assert(devinfo->gen >= 9); 692 if (inst->shadow_compare) { 693 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ; 694 } else { 695 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ; 696 } 697 break; 698 case SHADER_OPCODE_TXS: 699 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; 700 break; 701 case SHADER_OPCODE_TXD: 702 if (inst->shadow_compare) { 703 /* Gen7.5+. Otherwise, lowered in NIR */ 704 assert(devinfo->gen >= 8 || devinfo->is_haswell); 705 msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; 706 } else { 707 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; 708 } 709 break; 710 case SHADER_OPCODE_TXF: 711 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 712 break; 713 case SHADER_OPCODE_TXF_LZ: 714 assert(devinfo->gen >= 9); 715 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ; 716 break; 717 case SHADER_OPCODE_TXF_CMS_W: 718 assert(devinfo->gen >= 9); 719 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; 720 break; 721 case SHADER_OPCODE_TXF_CMS: 722 if (devinfo->gen >= 7) 723 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; 724 else 725 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 726 break; 727 case SHADER_OPCODE_TXF_UMS: 728 assert(devinfo->gen >= 7); 729 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS; 730 break; 731 case SHADER_OPCODE_TXF_MCS: 732 assert(devinfo->gen >= 7); 733 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; 734 break; 735 case SHADER_OPCODE_LOD: 736 msg_type = GEN5_SAMPLER_MESSAGE_LOD; 737 break; 738 case SHADER_OPCODE_TG4: 739 if (inst->shadow_compare) { 740 assert(devinfo->gen >= 7); 741 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; 742 } else { 743 assert(devinfo->gen >= 6); 744 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; 745 } 746 break; 747 case SHADER_OPCODE_TG4_OFFSET: 748 assert(devinfo->gen >= 7); 749 if (inst->shadow_compare) { 750 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; 751 } else { 752 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; 753 } 754 break; 755 case SHADER_OPCODE_SAMPLEINFO: 756 msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; 757 break; 758 default: 759 unreachable("not reached"); 760 } 761 } else { 762 switch (inst->opcode) { 763 case SHADER_OPCODE_TEX: 764 /* Note that G45 and older determines shadow compare and dispatch width 765 * from message length for most messages. 766 */ 767 if (inst->exec_size == 8) { 768 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; 769 if (inst->shadow_compare) { 770 assert(inst->mlen == 6); 771 } else { 772 assert(inst->mlen <= 4); 773 } 774 } else { 775 if (inst->shadow_compare) { 776 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE; 777 assert(inst->mlen == 9); 778 } else { 779 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE; 780 assert(inst->mlen <= 7 && inst->mlen % 2 == 1); 781 } 782 } 783 break; 784 case FS_OPCODE_TXB: 785 if (inst->shadow_compare) { 786 assert(inst->exec_size == 8); 787 assert(inst->mlen == 6); 788 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; 789 } else { 790 assert(inst->mlen == 9); 791 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; 792 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 793 } 794 break; 795 case SHADER_OPCODE_TXL: 796 if (inst->shadow_compare) { 797 assert(inst->exec_size == 8); 798 assert(inst->mlen == 6); 799 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; 800 } else { 801 assert(inst->mlen == 9); 802 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; 803 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 804 } 805 break; 806 case SHADER_OPCODE_TXD: 807 /* There is no sample_d_c message; comparisons are done manually */ 808 assert(inst->exec_size == 8); 809 assert(inst->mlen == 7 || inst->mlen == 10); 810 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS; 811 break; 812 case SHADER_OPCODE_TXF: 813 assert(inst->mlen <= 9 && inst->mlen % 2 == 1); 814 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; 815 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 816 break; 817 case SHADER_OPCODE_TXS: 818 assert(inst->mlen == 3); 819 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO; 820 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 821 break; 822 default: 823 unreachable("not reached"); 824 } 825 } 826 assert(msg_type != -1); 827 828 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) { 829 dst = vec16(dst); 830 } 831 832 assert(devinfo->gen < 7 || inst->header_size == 0 || 833 src.file == BRW_GENERAL_REGISTER_FILE); 834 835 assert(sampler_index.type == BRW_REGISTER_TYPE_UD); 836 837 /* Load the message header if present. If there's a texture offset, 838 * we need to set it up explicitly and load the offset bitfield. 839 * Otherwise, we can use an implied move from g0 to the first message reg. 840 */ 841 if (inst->header_size != 0) { 842 if (devinfo->gen < 6 && !inst->offset) { 843 /* Set up an implied move from g0 to the MRF. */ 844 src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); 845 } else { 846 struct brw_reg header_reg; 847 848 if (devinfo->gen >= 7) { 849 header_reg = src; 850 } else { 851 assert(inst->base_mrf != -1); 852 header_reg = brw_message_reg(inst->base_mrf); 853 } 854 855 brw_push_insn_state(p); 856 brw_set_default_exec_size(p, BRW_EXECUTE_8); 857 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 858 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 859 /* Explicitly set up the message header by copying g0 to the MRF. */ 860 brw_MOV(p, header_reg, brw_vec8_grf(0, 0)); 861 862 if (inst->offset) { 863 /* Set the offset bits in DWord 2. */ 864 brw_MOV(p, get_element_ud(header_reg, 2), 865 brw_imm_ud(inst->offset)); 866 } else if (stage != MESA_SHADER_VERTEX && 867 stage != MESA_SHADER_FRAGMENT) { 868 /* The vertex and fragment stages have g0.2 set to 0, so 869 * header0.2 is 0 when g0 is copied. Other stages may not, so we 870 * must set it to 0 to avoid setting undesirable bits in the 871 * message. 872 */ 873 brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0)); 874 } 875 876 brw_adjust_sampler_state_pointer(p, header_reg, sampler_index); 877 brw_pop_insn_state(p); 878 } 879 } 880 881 uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || 882 inst->opcode == SHADER_OPCODE_TG4_OFFSET) 883 ? prog_data->binding_table.gather_texture_start 884 : prog_data->binding_table.texture_start; 885 886 if (surface_index.file == BRW_IMMEDIATE_VALUE && 887 sampler_index.file == BRW_IMMEDIATE_VALUE) { 888 uint32_t surface = surface_index.ud; 889 uint32_t sampler = sampler_index.ud; 890 891 brw_SAMPLE(p, 892 retype(dst, BRW_REGISTER_TYPE_UW), 893 inst->base_mrf, 894 src, 895 surface + base_binding_table_index, 896 sampler % 16, 897 msg_type, 898 inst->size_written / REG_SIZE, 899 inst->mlen, 900 inst->header_size != 0, 901 simd_mode, 902 return_format); 903 904 brw_mark_surface_used(prog_data, surface + base_binding_table_index); 905 } else { 906 /* Non-const sampler index */ 907 908 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 909 struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD)); 910 struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); 911 912 brw_push_insn_state(p); 913 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 914 brw_set_default_access_mode(p, BRW_ALIGN_1); 915 916 if (brw_regs_equal(&surface_reg, &sampler_reg)) { 917 brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); 918 } else { 919 if (sampler_reg.file == BRW_IMMEDIATE_VALUE) { 920 brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8)); 921 } else { 922 brw_SHL(p, addr, sampler_reg, brw_imm_ud(8)); 923 brw_OR(p, addr, addr, surface_reg); 924 } 925 } 926 if (base_binding_table_index) 927 brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index)); 928 brw_AND(p, addr, addr, brw_imm_ud(0xfff)); 929 930 brw_pop_insn_state(p); 931 932 /* dst = send(offset, a0.0 | <descriptor>) */ 933 brw_inst *insn = brw_send_indirect_message( 934 p, BRW_SFID_SAMPLER, dst, src, addr); 935 brw_set_sampler_message(p, insn, 936 0 /* surface */, 937 0 /* sampler */, 938 msg_type, 939 inst->size_written / REG_SIZE, 940 inst->mlen /* mlen */, 941 inst->header_size != 0 /* header */, 942 simd_mode, 943 return_format); 944 945 /* visitor knows more than we do about the surface limit required, 946 * so has already done marking. 947 */ 948 } 949 950 if (is_combined_send) { 951 brw_inst_set_eot(p->devinfo, brw_last_inst, true); 952 brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC); 953 } 954 } 955 956 957 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input 958 * looking like: 959 * 960 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br 961 * 962 * Ideally, we want to produce: 963 * 964 * DDX DDY 965 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) 966 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br) 967 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl) 968 * (ss0.br - ss0.bl) (ss0.tr - ss0.br) 969 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl) 970 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br) 971 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl) 972 * (ss1.br - ss1.bl) (ss1.tr - ss1.br) 973 * 974 * and add another set of two more subspans if in 16-pixel dispatch mode. 975 * 976 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result 977 * for each pair, and vertstride = 2 jumps us 2 elements after processing a 978 * pair. But the ideal approximation may impose a huge performance cost on 979 * sample_d. On at least Haswell, sample_d instruction does some 980 * optimizations if the same LOD is used for all pixels in the subspan. 981 * 982 * For DDY, we need to use ALIGN16 mode since it's capable of doing the 983 * appropriate swizzling. 984 */ 985 void 986 fs_generator::generate_ddx(enum opcode opcode, 987 struct brw_reg dst, struct brw_reg src) 988 { 989 unsigned vstride, width; 990 991 if (opcode == FS_OPCODE_DDX_FINE) { 992 /* produce accurate derivatives */ 993 vstride = BRW_VERTICAL_STRIDE_2; 994 width = BRW_WIDTH_2; 995 } else { 996 /* replicate the derivative at the top-left pixel to other pixels */ 997 vstride = BRW_VERTICAL_STRIDE_4; 998 width = BRW_WIDTH_4; 999 } 1000 1001 struct brw_reg src0 = brw_reg(src.file, src.nr, 1, 1002 src.negate, src.abs, 1003 BRW_REGISTER_TYPE_F, 1004 vstride, 1005 width, 1006 BRW_HORIZONTAL_STRIDE_0, 1007 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 1008 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 1009 src.negate, src.abs, 1010 BRW_REGISTER_TYPE_F, 1011 vstride, 1012 width, 1013 BRW_HORIZONTAL_STRIDE_0, 1014 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 1015 brw_ADD(p, dst, src0, negate(src1)); 1016 } 1017 1018 /* The negate_value boolean is used to negate the derivative computation for 1019 * FBOs, since they place the origin at the upper left instead of the lower 1020 * left. 1021 */ 1022 void 1023 fs_generator::generate_ddy(enum opcode opcode, 1024 struct brw_reg dst, struct brw_reg src) 1025 { 1026 if (opcode == FS_OPCODE_DDY_FINE) { 1027 /* produce accurate derivatives */ 1028 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 1029 src.negate, src.abs, 1030 BRW_REGISTER_TYPE_F, 1031 BRW_VERTICAL_STRIDE_4, 1032 BRW_WIDTH_4, 1033 BRW_HORIZONTAL_STRIDE_1, 1034 BRW_SWIZZLE_XYXY, WRITEMASK_XYZW); 1035 struct brw_reg src1 = brw_reg(src.file, src.nr, 0, 1036 src.negate, src.abs, 1037 BRW_REGISTER_TYPE_F, 1038 BRW_VERTICAL_STRIDE_4, 1039 BRW_WIDTH_4, 1040 BRW_HORIZONTAL_STRIDE_1, 1041 BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW); 1042 brw_push_insn_state(p); 1043 brw_set_default_access_mode(p, BRW_ALIGN_16); 1044 brw_ADD(p, dst, negate(src0), src1); 1045 brw_pop_insn_state(p); 1046 } else { 1047 /* replicate the derivative at the top-left pixel to other pixels */ 1048 struct brw_reg src0 = brw_reg(src.file, src.nr, 0, 1049 src.negate, src.abs, 1050 BRW_REGISTER_TYPE_F, 1051 BRW_VERTICAL_STRIDE_4, 1052 BRW_WIDTH_4, 1053 BRW_HORIZONTAL_STRIDE_0, 1054 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 1055 struct brw_reg src1 = brw_reg(src.file, src.nr, 2, 1056 src.negate, src.abs, 1057 BRW_REGISTER_TYPE_F, 1058 BRW_VERTICAL_STRIDE_4, 1059 BRW_WIDTH_4, 1060 BRW_HORIZONTAL_STRIDE_0, 1061 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); 1062 brw_ADD(p, dst, negate(src0), src1); 1063 } 1064 } 1065 1066 void 1067 fs_generator::generate_discard_jump(fs_inst *inst) 1068 { 1069 assert(devinfo->gen >= 6); 1070 1071 /* This HALT will be patched up at FB write time to point UIP at the end of 1072 * the program, and at brw_uip_jip() JIP will be set to the end of the 1073 * current block (or the program). 1074 */ 1075 this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); 1076 gen6_HALT(p); 1077 } 1078 1079 void 1080 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src) 1081 { 1082 /* The 32-wide messages only respect the first 16-wide half of the channel 1083 * enable signals which are replicated identically for the second group of 1084 * 16 channels, so we cannot use them unless the write is marked 1085 * force_writemask_all. 1086 */ 1087 const unsigned lower_size = inst->force_writemask_all ? inst->exec_size : 1088 MIN2(16, inst->exec_size); 1089 const unsigned block_size = 4 * lower_size / REG_SIZE; 1090 assert(inst->mlen != 0); 1091 1092 brw_push_insn_state(p); 1093 brw_set_default_exec_size(p, cvt(lower_size) - 1); 1094 brw_set_default_compression(p, lower_size > 8); 1095 1096 for (unsigned i = 0; i < inst->exec_size / lower_size; i++) { 1097 brw_set_default_group(p, inst->group + lower_size * i); 1098 1099 brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0), 1100 retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD)); 1101 1102 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1103 block_size, 1104 inst->offset + block_size * REG_SIZE * i); 1105 } 1106 1107 brw_pop_insn_state(p); 1108 } 1109 1110 void 1111 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst) 1112 { 1113 assert(inst->exec_size <= 16 || inst->force_writemask_all); 1114 assert(inst->mlen != 0); 1115 1116 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1117 inst->exec_size / 8, inst->offset); 1118 } 1119 1120 void 1121 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst) 1122 { 1123 assert(inst->exec_size <= 16 || inst->force_writemask_all); 1124 1125 gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset); 1126 } 1127 1128 void 1129 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, 1130 struct brw_reg dst, 1131 struct brw_reg index, 1132 struct brw_reg offset) 1133 { 1134 assert(type_sz(dst.type) == 4); 1135 assert(inst->mlen != 0); 1136 1137 assert(index.file == BRW_IMMEDIATE_VALUE && 1138 index.type == BRW_REGISTER_TYPE_UD); 1139 uint32_t surf_index = index.ud; 1140 1141 assert(offset.file == BRW_IMMEDIATE_VALUE && 1142 offset.type == BRW_REGISTER_TYPE_UD); 1143 uint32_t read_offset = offset.ud; 1144 1145 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 1146 read_offset, surf_index); 1147 } 1148 1149 void 1150 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, 1151 struct brw_reg dst, 1152 struct brw_reg index, 1153 struct brw_reg payload) 1154 { 1155 assert(index.type == BRW_REGISTER_TYPE_UD); 1156 assert(payload.file == BRW_GENERAL_REGISTER_FILE); 1157 assert(type_sz(dst.type) == 4); 1158 1159 if (index.file == BRW_IMMEDIATE_VALUE) { 1160 const uint32_t surf_index = index.ud; 1161 1162 brw_push_insn_state(p); 1163 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1164 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1165 brw_pop_insn_state(p); 1166 1167 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD)); 1168 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD)); 1169 brw_set_dp_read_message(p, send, surf_index, 1170 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size), 1171 GEN7_DATAPORT_DC_OWORD_BLOCK_READ, 1172 GEN6_SFID_DATAPORT_CONSTANT_CACHE, 1173 1, /* mlen */ 1174 true, /* header */ 1175 DIV_ROUND_UP(inst->size_written, REG_SIZE)); 1176 1177 } else { 1178 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 1179 1180 brw_push_insn_state(p); 1181 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1182 1183 /* a0.0 = surf_index & 0xff */ 1184 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); 1185 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); 1186 brw_set_dest(p, insn_and, addr); 1187 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); 1188 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); 1189 1190 /* dst = send(payload, a0.0 | <descriptor>) */ 1191 brw_inst *insn = brw_send_indirect_message( 1192 p, GEN6_SFID_DATAPORT_CONSTANT_CACHE, 1193 retype(dst, BRW_REGISTER_TYPE_UD), 1194 retype(payload, BRW_REGISTER_TYPE_UD), addr); 1195 brw_set_dp_read_message(p, insn, 0 /* surface */, 1196 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size), 1197 GEN7_DATAPORT_DC_OWORD_BLOCK_READ, 1198 GEN6_SFID_DATAPORT_CONSTANT_CACHE, 1199 1, /* mlen */ 1200 true, /* header */ 1201 DIV_ROUND_UP(inst->size_written, REG_SIZE)); 1202 1203 brw_pop_insn_state(p); 1204 } 1205 } 1206 1207 void 1208 fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst, 1209 struct brw_reg dst, 1210 struct brw_reg index) 1211 { 1212 assert(devinfo->gen < 7); /* Should use the gen7 variant. */ 1213 assert(inst->header_size != 0); 1214 assert(inst->mlen); 1215 1216 assert(index.file == BRW_IMMEDIATE_VALUE && 1217 index.type == BRW_REGISTER_TYPE_UD); 1218 uint32_t surf_index = index.ud; 1219 1220 uint32_t simd_mode, rlen, msg_type; 1221 if (inst->exec_size == 16) { 1222 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1223 rlen = 8; 1224 } else { 1225 assert(inst->exec_size == 8); 1226 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1227 rlen = 4; 1228 } 1229 1230 if (devinfo->gen >= 5) 1231 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 1232 else { 1233 /* We always use the SIMD16 message so that we only have to load U, and 1234 * not V or R. 1235 */ 1236 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD; 1237 assert(inst->mlen == 3); 1238 assert(inst->size_written == 8 * REG_SIZE); 1239 rlen = 8; 1240 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1241 } 1242 1243 struct brw_reg header = brw_vec8_grf(0, 0); 1244 gen6_resolve_implied_move(p, &header, inst->base_mrf); 1245 1246 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1247 brw_inst_set_compression(devinfo, send, false); 1248 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW)); 1249 brw_set_src0(p, send, header); 1250 if (devinfo->gen < 6) 1251 brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf); 1252 1253 /* Our surface is set up as floats, regardless of what actual data is 1254 * stored in it. 1255 */ 1256 uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 1257 brw_set_sampler_message(p, send, 1258 surf_index, 1259 0, /* sampler (unused) */ 1260 msg_type, 1261 rlen, 1262 inst->mlen, 1263 inst->header_size != 0, 1264 simd_mode, 1265 return_format); 1266 } 1267 1268 void 1269 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst, 1270 struct brw_reg dst, 1271 struct brw_reg index, 1272 struct brw_reg offset) 1273 { 1274 assert(devinfo->gen >= 7); 1275 /* Varying-offset pull constant loads are treated as a normal expression on 1276 * gen7, so the fact that it's a send message is hidden at the IR level. 1277 */ 1278 assert(inst->header_size == 0); 1279 assert(!inst->mlen); 1280 assert(index.type == BRW_REGISTER_TYPE_UD); 1281 1282 uint32_t simd_mode, rlen, mlen; 1283 if (inst->exec_size == 16) { 1284 mlen = 2; 1285 rlen = 8; 1286 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; 1287 } else { 1288 assert(inst->exec_size == 8); 1289 mlen = 1; 1290 rlen = 4; 1291 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; 1292 } 1293 1294 if (index.file == BRW_IMMEDIATE_VALUE) { 1295 1296 uint32_t surf_index = index.ud; 1297 1298 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 1299 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW)); 1300 brw_set_src0(p, send, offset); 1301 brw_set_sampler_message(p, send, 1302 surf_index, 1303 0, /* LD message ignores sampler unit */ 1304 GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1305 rlen, 1306 mlen, 1307 false, /* no header */ 1308 simd_mode, 1309 0); 1310 1311 } else { 1312 1313 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 1314 1315 brw_push_insn_state(p); 1316 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1317 1318 /* a0.0 = surf_index & 0xff */ 1319 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); 1320 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1); 1321 brw_set_dest(p, insn_and, addr); 1322 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD))); 1323 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); 1324 1325 brw_pop_insn_state(p); 1326 1327 /* dst = send(offset, a0.0 | <descriptor>) */ 1328 brw_inst *insn = brw_send_indirect_message( 1329 p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW), 1330 offset, addr); 1331 brw_set_sampler_message(p, insn, 1332 0 /* surface */, 1333 0 /* sampler */, 1334 GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1335 rlen /* rlen */, 1336 mlen /* mlen */, 1337 false /* header */, 1338 simd_mode, 1339 0); 1340 } 1341 } 1342 1343 /** 1344 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred 1345 * into the flags register (f0.0). 1346 * 1347 * Used only on Gen6 and above. 1348 */ 1349 void 1350 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst) 1351 { 1352 struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg); 1353 struct brw_reg dispatch_mask; 1354 1355 if (devinfo->gen >= 6) 1356 dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW); 1357 else 1358 dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); 1359 1360 brw_push_insn_state(p); 1361 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 1362 brw_MOV(p, flags, dispatch_mask); 1363 brw_pop_insn_state(p); 1364 } 1365 1366 void 1367 fs_generator::generate_pixel_interpolator_query(fs_inst *inst, 1368 struct brw_reg dst, 1369 struct brw_reg src, 1370 struct brw_reg msg_data, 1371 unsigned msg_type) 1372 { 1373 assert(inst->size_written % REG_SIZE == 0); 1374 assert(msg_data.type == BRW_REGISTER_TYPE_UD); 1375 1376 brw_pixel_interpolator_query(p, 1377 retype(dst, BRW_REGISTER_TYPE_UW), 1378 src, 1379 inst->pi_noperspective, 1380 msg_type, 1381 msg_data, 1382 inst->mlen, 1383 inst->size_written / REG_SIZE); 1384 } 1385 1386 /* Sets vstride=1, width=4, hstride=0 of register src1 during 1387 * the ADD instruction. 1388 */ 1389 void 1390 fs_generator::generate_set_sample_id(fs_inst *inst, 1391 struct brw_reg dst, 1392 struct brw_reg src0, 1393 struct brw_reg src1) 1394 { 1395 assert(dst.type == BRW_REGISTER_TYPE_D || 1396 dst.type == BRW_REGISTER_TYPE_UD); 1397 assert(src0.type == BRW_REGISTER_TYPE_D || 1398 src0.type == BRW_REGISTER_TYPE_UD); 1399 1400 struct brw_reg reg = stride(src1, 1, 4, 0); 1401 if (devinfo->gen >= 8 || inst->exec_size == 8) { 1402 brw_ADD(p, dst, src0, reg); 1403 } else if (inst->exec_size == 16) { 1404 brw_push_insn_state(p); 1405 brw_set_default_exec_size(p, BRW_EXECUTE_8); 1406 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); 1407 brw_ADD(p, firsthalf(dst), firsthalf(src0), reg); 1408 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); 1409 brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2)); 1410 brw_pop_insn_state(p); 1411 } 1412 } 1413 1414 void 1415 fs_generator::generate_pack_half_2x16_split(fs_inst *inst, 1416 struct brw_reg dst, 1417 struct brw_reg x, 1418 struct brw_reg y) 1419 { 1420 assert(devinfo->gen >= 7); 1421 assert(dst.type == BRW_REGISTER_TYPE_UD); 1422 assert(x.type == BRW_REGISTER_TYPE_F); 1423 assert(y.type == BRW_REGISTER_TYPE_F); 1424 1425 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: 1426 * 1427 * Because this instruction does not have a 16-bit floating-point type, 1428 * the destination data type must be Word (W). 1429 * 1430 * The destination must be DWord-aligned and specify a horizontal stride 1431 * (HorzStride) of 2. The 16-bit result is stored in the lower word of 1432 * each destination channel and the upper word is not modified. 1433 */ 1434 struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2); 1435 1436 /* Give each 32-bit channel of dst the form below, where "." means 1437 * unchanged. 1438 * 0x....hhhh 1439 */ 1440 brw_F32TO16(p, dst_w, y); 1441 1442 /* Now the form: 1443 * 0xhhhh0000 1444 */ 1445 brw_SHL(p, dst, dst, brw_imm_ud(16u)); 1446 1447 /* And, finally the form of packHalf2x16's output: 1448 * 0xhhhhllll 1449 */ 1450 brw_F32TO16(p, dst_w, x); 1451 } 1452 1453 void 1454 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst, 1455 struct brw_reg dst, 1456 struct brw_reg src) 1457 { 1458 assert(devinfo->gen >= 7); 1459 assert(dst.type == BRW_REGISTER_TYPE_F); 1460 assert(src.type == BRW_REGISTER_TYPE_UD); 1461 1462 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: 1463 * 1464 * Because this instruction does not have a 16-bit floating-point type, 1465 * the source data type must be Word (W). The destination type must be 1466 * F (Float). 1467 */ 1468 struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2); 1469 1470 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll. 1471 * For the Y case, we wish to access only the upper word; therefore 1472 * a 16-bit subregister offset is needed. 1473 */ 1474 assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X || 1475 inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y); 1476 if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y) 1477 src_w.subnr += 2; 1478 1479 brw_F16TO32(p, dst, src_w); 1480 } 1481 1482 void 1483 fs_generator::generate_shader_time_add(fs_inst *inst, 1484 struct brw_reg payload, 1485 struct brw_reg offset, 1486 struct brw_reg value) 1487 { 1488 assert(devinfo->gen >= 7); 1489 brw_push_insn_state(p); 1490 brw_set_default_mask_control(p, true); 1491 1492 assert(payload.file == BRW_GENERAL_REGISTER_FILE); 1493 struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0), 1494 offset.type); 1495 struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0), 1496 value.type); 1497 1498 assert(offset.file == BRW_IMMEDIATE_VALUE); 1499 if (value.file == BRW_GENERAL_REGISTER_FILE) { 1500 value.width = BRW_WIDTH_1; 1501 value.hstride = BRW_HORIZONTAL_STRIDE_0; 1502 value.vstride = BRW_VERTICAL_STRIDE_0; 1503 } else { 1504 assert(value.file == BRW_IMMEDIATE_VALUE); 1505 } 1506 1507 /* Trying to deal with setup of the params from the IR is crazy in the FS8 1508 * case, and we don't really care about squeezing every bit of performance 1509 * out of this path, so we just emit the MOVs from here. 1510 */ 1511 brw_MOV(p, payload_offset, offset); 1512 brw_MOV(p, payload_value, value); 1513 brw_shader_time_add(p, payload, 1514 prog_data->binding_table.shader_time_start); 1515 brw_pop_insn_state(p); 1516 1517 brw_mark_surface_used(prog_data, 1518 prog_data->binding_table.shader_time_start); 1519 } 1520 1521 void 1522 fs_generator::enable_debug(const char *shader_name) 1523 { 1524 debug_flag = true; 1525 this->shader_name = shader_name; 1526 } 1527 1528 int 1529 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) 1530 { 1531 /* align to 64 byte boundary. */ 1532 while (p->next_insn_offset % 64) 1533 brw_NOP(p); 1534 1535 this->dispatch_width = dispatch_width; 1536 1537 int start_offset = p->next_insn_offset; 1538 int spill_count = 0, fill_count = 0; 1539 int loop_count = 0; 1540 1541 struct annotation_info annotation; 1542 memset(&annotation, 0, sizeof(annotation)); 1543 1544 foreach_block_and_inst (block, fs_inst, inst, cfg) { 1545 struct brw_reg src[3], dst; 1546 unsigned int last_insn_offset = p->next_insn_offset; 1547 bool multiple_instructions_emitted = false; 1548 1549 /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the 1550 * "Register Region Restrictions" section: for BDW, SKL: 1551 * 1552 * "A POW/FDIV operation must not be followed by an instruction 1553 * that requires two destination registers." 1554 * 1555 * The documentation is often lacking annotations for Atom parts, 1556 * and empirically this affects CHV as well. 1557 */ 1558 if (devinfo->gen >= 8 && 1559 p->nr_insn > 1 && 1560 brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH && 1561 brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW && 1562 inst->dst.component_size(inst->exec_size) > REG_SIZE) { 1563 brw_NOP(p); 1564 last_insn_offset = p->next_insn_offset; 1565 } 1566 1567 if (unlikely(debug_flag)) 1568 annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset); 1569 1570 /* If the instruction writes to more than one register, it needs to be 1571 * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the 1572 * hardware figures out by itself what the right compression mode is, 1573 * but we still need to know whether the instruction is compressed to 1574 * set up the source register regions appropriately. 1575 * 1576 * XXX - This is wrong for instructions that write a single register but 1577 * read more than one which should strictly speaking be treated as 1578 * compressed. For instructions that don't write any registers it 1579 * relies on the destination being a null register of the correct 1580 * type and regioning so the instruction is considered compressed 1581 * or not accordingly. 1582 */ 1583 const bool compressed = 1584 inst->dst.component_size(inst->exec_size) > REG_SIZE; 1585 brw_set_default_compression(p, compressed); 1586 brw_set_default_group(p, inst->group); 1587 1588 for (unsigned int i = 0; i < inst->sources; i++) { 1589 src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen, 1590 compressed); 1591 1592 /* The accumulator result appears to get used for the 1593 * conditional modifier generation. When negating a UD 1594 * value, there is a 33rd bit generated for the sign in the 1595 * accumulator value, so now you can't check, for example, 1596 * equality with a 32-bit value. See piglit fs-op-neg-uvec4. 1597 */ 1598 assert(!inst->conditional_mod || 1599 inst->src[i].type != BRW_REGISTER_TYPE_UD || 1600 !inst->src[i].negate); 1601 } 1602 dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed); 1603 1604 brw_set_default_access_mode(p, BRW_ALIGN_1); 1605 brw_set_default_predicate_control(p, inst->predicate); 1606 brw_set_default_predicate_inverse(p, inst->predicate_inverse); 1607 brw_set_default_flag_reg(p, 0, inst->flag_subreg); 1608 brw_set_default_saturate(p, inst->saturate); 1609 brw_set_default_mask_control(p, inst->force_writemask_all); 1610 brw_set_default_acc_write_control(p, inst->writes_accumulator); 1611 brw_set_default_exec_size(p, cvt(inst->exec_size) - 1); 1612 1613 assert(inst->force_writemask_all || inst->exec_size >= 4); 1614 assert(inst->force_writemask_all || inst->group % inst->exec_size == 0); 1615 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen)); 1616 assert(inst->mlen <= BRW_MAX_MSG_LENGTH); 1617 1618 switch (inst->opcode) { 1619 case BRW_OPCODE_MOV: 1620 brw_MOV(p, dst, src[0]); 1621 break; 1622 case BRW_OPCODE_ADD: 1623 brw_ADD(p, dst, src[0], src[1]); 1624 break; 1625 case BRW_OPCODE_MUL: 1626 brw_MUL(p, dst, src[0], src[1]); 1627 break; 1628 case BRW_OPCODE_AVG: 1629 brw_AVG(p, dst, src[0], src[1]); 1630 break; 1631 case BRW_OPCODE_MACH: 1632 brw_MACH(p, dst, src[0], src[1]); 1633 break; 1634 1635 case BRW_OPCODE_LINE: 1636 brw_LINE(p, dst, src[0], src[1]); 1637 break; 1638 1639 case BRW_OPCODE_MAD: 1640 assert(devinfo->gen >= 6); 1641 brw_set_default_access_mode(p, BRW_ALIGN_16); 1642 brw_MAD(p, dst, src[0], src[1], src[2]); 1643 break; 1644 1645 case BRW_OPCODE_LRP: 1646 assert(devinfo->gen >= 6); 1647 brw_set_default_access_mode(p, BRW_ALIGN_16); 1648 brw_LRP(p, dst, src[0], src[1], src[2]); 1649 break; 1650 1651 case BRW_OPCODE_FRC: 1652 brw_FRC(p, dst, src[0]); 1653 break; 1654 case BRW_OPCODE_RNDD: 1655 brw_RNDD(p, dst, src[0]); 1656 break; 1657 case BRW_OPCODE_RNDE: 1658 brw_RNDE(p, dst, src[0]); 1659 break; 1660 case BRW_OPCODE_RNDZ: 1661 brw_RNDZ(p, dst, src[0]); 1662 break; 1663 1664 case BRW_OPCODE_AND: 1665 brw_AND(p, dst, src[0], src[1]); 1666 break; 1667 case BRW_OPCODE_OR: 1668 brw_OR(p, dst, src[0], src[1]); 1669 break; 1670 case BRW_OPCODE_XOR: 1671 brw_XOR(p, dst, src[0], src[1]); 1672 break; 1673 case BRW_OPCODE_NOT: 1674 brw_NOT(p, dst, src[0]); 1675 break; 1676 case BRW_OPCODE_ASR: 1677 brw_ASR(p, dst, src[0], src[1]); 1678 break; 1679 case BRW_OPCODE_SHR: 1680 brw_SHR(p, dst, src[0], src[1]); 1681 break; 1682 case BRW_OPCODE_SHL: 1683 brw_SHL(p, dst, src[0], src[1]); 1684 break; 1685 case BRW_OPCODE_F32TO16: 1686 assert(devinfo->gen >= 7); 1687 brw_F32TO16(p, dst, src[0]); 1688 break; 1689 case BRW_OPCODE_F16TO32: 1690 assert(devinfo->gen >= 7); 1691 brw_F16TO32(p, dst, src[0]); 1692 break; 1693 case BRW_OPCODE_CMP: 1694 if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell && 1695 dst.file == BRW_ARCHITECTURE_REGISTER_FILE) { 1696 /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround 1697 * implemented in the compiler is not sufficient. Overriding the 1698 * type when the destination is the null register is necessary but 1699 * not sufficient by itself. 1700 */ 1701 assert(dst.nr == BRW_ARF_NULL); 1702 dst.type = BRW_REGISTER_TYPE_D; 1703 } 1704 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 1705 break; 1706 case BRW_OPCODE_SEL: 1707 brw_SEL(p, dst, src[0], src[1]); 1708 break; 1709 case BRW_OPCODE_BFREV: 1710 assert(devinfo->gen >= 7); 1711 /* BFREV only supports UD type for src and dst. */ 1712 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), 1713 retype(src[0], BRW_REGISTER_TYPE_UD)); 1714 break; 1715 case BRW_OPCODE_FBH: 1716 assert(devinfo->gen >= 7); 1717 /* FBH only supports UD type for dst. */ 1718 brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); 1719 break; 1720 case BRW_OPCODE_FBL: 1721 assert(devinfo->gen >= 7); 1722 /* FBL only supports UD type for dst. */ 1723 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); 1724 break; 1725 case BRW_OPCODE_LZD: 1726 brw_LZD(p, dst, src[0]); 1727 break; 1728 case BRW_OPCODE_CBIT: 1729 assert(devinfo->gen >= 7); 1730 /* CBIT only supports UD type for dst. */ 1731 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]); 1732 break; 1733 case BRW_OPCODE_ADDC: 1734 assert(devinfo->gen >= 7); 1735 brw_ADDC(p, dst, src[0], src[1]); 1736 break; 1737 case BRW_OPCODE_SUBB: 1738 assert(devinfo->gen >= 7); 1739 brw_SUBB(p, dst, src[0], src[1]); 1740 break; 1741 case BRW_OPCODE_MAC: 1742 brw_MAC(p, dst, src[0], src[1]); 1743 break; 1744 1745 case BRW_OPCODE_BFE: 1746 assert(devinfo->gen >= 7); 1747 brw_set_default_access_mode(p, BRW_ALIGN_16); 1748 brw_BFE(p, dst, src[0], src[1], src[2]); 1749 break; 1750 1751 case BRW_OPCODE_BFI1: 1752 assert(devinfo->gen >= 7); 1753 brw_BFI1(p, dst, src[0], src[1]); 1754 break; 1755 case BRW_OPCODE_BFI2: 1756 assert(devinfo->gen >= 7); 1757 brw_set_default_access_mode(p, BRW_ALIGN_16); 1758 brw_BFI2(p, dst, src[0], src[1], src[2]); 1759 break; 1760 1761 case BRW_OPCODE_IF: 1762 if (inst->src[0].file != BAD_FILE) { 1763 /* The instruction has an embedded compare (only allowed on gen6) */ 1764 assert(devinfo->gen == 6); 1765 gen6_IF(p, inst->conditional_mod, src[0], src[1]); 1766 } else { 1767 brw_IF(p, brw_inst_exec_size(devinfo, p->current)); 1768 } 1769 break; 1770 1771 case BRW_OPCODE_ELSE: 1772 brw_ELSE(p); 1773 break; 1774 case BRW_OPCODE_ENDIF: 1775 brw_ENDIF(p); 1776 break; 1777 1778 case BRW_OPCODE_DO: 1779 brw_DO(p, brw_inst_exec_size(devinfo, p->current)); 1780 break; 1781 1782 case BRW_OPCODE_BREAK: 1783 brw_BREAK(p); 1784 break; 1785 case BRW_OPCODE_CONTINUE: 1786 brw_CONT(p); 1787 break; 1788 1789 case BRW_OPCODE_WHILE: 1790 brw_WHILE(p); 1791 loop_count++; 1792 break; 1793 1794 case SHADER_OPCODE_RCP: 1795 case SHADER_OPCODE_RSQ: 1796 case SHADER_OPCODE_SQRT: 1797 case SHADER_OPCODE_EXP2: 1798 case SHADER_OPCODE_LOG2: 1799 case SHADER_OPCODE_SIN: 1800 case SHADER_OPCODE_COS: 1801 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 1802 if (devinfo->gen >= 6) { 1803 assert(inst->mlen == 0); 1804 assert(devinfo->gen >= 7 || inst->exec_size == 8); 1805 gen6_math(p, dst, brw_math_function(inst->opcode), 1806 src[0], brw_null_reg()); 1807 } else { 1808 assert(inst->mlen >= 1); 1809 assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8); 1810 gen4_math(p, dst, 1811 brw_math_function(inst->opcode), 1812 inst->base_mrf, src[0], 1813 BRW_MATH_PRECISION_FULL); 1814 } 1815 break; 1816 case SHADER_OPCODE_INT_QUOTIENT: 1817 case SHADER_OPCODE_INT_REMAINDER: 1818 case SHADER_OPCODE_POW: 1819 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 1820 if (devinfo->gen >= 6) { 1821 assert(inst->mlen == 0); 1822 assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) || 1823 inst->exec_size == 8); 1824 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); 1825 } else { 1826 assert(inst->mlen >= 1); 1827 assert(inst->exec_size == 8); 1828 gen4_math(p, dst, brw_math_function(inst->opcode), 1829 inst->base_mrf, src[0], 1830 BRW_MATH_PRECISION_FULL); 1831 } 1832 break; 1833 case FS_OPCODE_CINTERP: 1834 brw_MOV(p, dst, src[0]); 1835 break; 1836 case FS_OPCODE_LINTERP: 1837 generate_linterp(inst, dst, src); 1838 break; 1839 case FS_OPCODE_PIXEL_X: 1840 assert(src[0].type == BRW_REGISTER_TYPE_UW); 1841 src[0].subnr = 0 * type_sz(src[0].type); 1842 brw_MOV(p, dst, stride(src[0], 8, 4, 1)); 1843 break; 1844 case FS_OPCODE_PIXEL_Y: 1845 assert(src[0].type == BRW_REGISTER_TYPE_UW); 1846 src[0].subnr = 4 * type_sz(src[0].type); 1847 brw_MOV(p, dst, stride(src[0], 8, 4, 1)); 1848 break; 1849 case FS_OPCODE_GET_BUFFER_SIZE: 1850 generate_get_buffer_size(inst, dst, src[0], src[1]); 1851 break; 1852 case SHADER_OPCODE_TEX: 1853 case FS_OPCODE_TXB: 1854 case SHADER_OPCODE_TXD: 1855 case SHADER_OPCODE_TXF: 1856 case SHADER_OPCODE_TXF_LZ: 1857 case SHADER_OPCODE_TXF_CMS: 1858 case SHADER_OPCODE_TXF_CMS_W: 1859 case SHADER_OPCODE_TXF_UMS: 1860 case SHADER_OPCODE_TXF_MCS: 1861 case SHADER_OPCODE_TXL: 1862 case SHADER_OPCODE_TXL_LZ: 1863 case SHADER_OPCODE_TXS: 1864 case SHADER_OPCODE_LOD: 1865 case SHADER_OPCODE_TG4: 1866 case SHADER_OPCODE_TG4_OFFSET: 1867 case SHADER_OPCODE_SAMPLEINFO: 1868 generate_tex(inst, dst, src[0], src[1], src[2]); 1869 break; 1870 case FS_OPCODE_DDX_COARSE: 1871 case FS_OPCODE_DDX_FINE: 1872 generate_ddx(inst->opcode, dst, src[0]); 1873 break; 1874 case FS_OPCODE_DDY_COARSE: 1875 case FS_OPCODE_DDY_FINE: 1876 generate_ddy(inst->opcode, dst, src[0]); 1877 break; 1878 1879 case SHADER_OPCODE_GEN4_SCRATCH_WRITE: 1880 generate_scratch_write(inst, src[0]); 1881 spill_count++; 1882 break; 1883 1884 case SHADER_OPCODE_GEN4_SCRATCH_READ: 1885 generate_scratch_read(inst, dst); 1886 fill_count++; 1887 break; 1888 1889 case SHADER_OPCODE_GEN7_SCRATCH_READ: 1890 generate_scratch_read_gen7(inst, dst); 1891 fill_count++; 1892 break; 1893 1894 case SHADER_OPCODE_MOV_INDIRECT: 1895 generate_mov_indirect(inst, dst, src[0], src[1]); 1896 break; 1897 1898 case SHADER_OPCODE_URB_READ_SIMD8: 1899 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: 1900 generate_urb_read(inst, dst, src[0]); 1901 break; 1902 1903 case SHADER_OPCODE_URB_WRITE_SIMD8: 1904 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: 1905 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: 1906 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: 1907 generate_urb_write(inst, src[0]); 1908 break; 1909 1910 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 1911 assert(inst->force_writemask_all); 1912 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]); 1913 break; 1914 1915 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: 1916 assert(inst->force_writemask_all); 1917 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]); 1918 break; 1919 1920 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: 1921 generate_varying_pull_constant_load_gen4(inst, dst, src[0]); 1922 break; 1923 1924 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: 1925 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]); 1926 break; 1927 1928 case FS_OPCODE_REP_FB_WRITE: 1929 case FS_OPCODE_FB_WRITE: 1930 generate_fb_write(inst, src[0]); 1931 break; 1932 1933 case FS_OPCODE_FB_READ: 1934 generate_fb_read(inst, dst, src[0]); 1935 break; 1936 1937 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS: 1938 generate_mov_dispatch_to_flags(inst); 1939 break; 1940 1941 case FS_OPCODE_DISCARD_JUMP: 1942 generate_discard_jump(inst); 1943 break; 1944 1945 case SHADER_OPCODE_SHADER_TIME_ADD: 1946 generate_shader_time_add(inst, src[0], src[1], src[2]); 1947 break; 1948 1949 case SHADER_OPCODE_UNTYPED_ATOMIC: 1950 assert(src[2].file == BRW_IMMEDIATE_VALUE); 1951 brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, 1952 inst->mlen, !inst->dst.is_null()); 1953 break; 1954 1955 case SHADER_OPCODE_UNTYPED_SURFACE_READ: 1956 assert(src[2].file == BRW_IMMEDIATE_VALUE); 1957 brw_untyped_surface_read(p, dst, src[0], src[1], 1958 inst->mlen, src[2].ud); 1959 break; 1960 1961 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: 1962 assert(src[2].file == BRW_IMMEDIATE_VALUE); 1963 brw_untyped_surface_write(p, src[0], src[1], 1964 inst->mlen, src[2].ud); 1965 break; 1966 1967 case SHADER_OPCODE_TYPED_ATOMIC: 1968 assert(src[2].file == BRW_IMMEDIATE_VALUE); 1969 brw_typed_atomic(p, dst, src[0], src[1], 1970 src[2].ud, inst->mlen, !inst->dst.is_null()); 1971 break; 1972 1973 case SHADER_OPCODE_TYPED_SURFACE_READ: 1974 assert(src[2].file == BRW_IMMEDIATE_VALUE); 1975 brw_typed_surface_read(p, dst, src[0], src[1], 1976 inst->mlen, src[2].ud); 1977 break; 1978 1979 case SHADER_OPCODE_TYPED_SURFACE_WRITE: 1980 assert(src[2].file == BRW_IMMEDIATE_VALUE); 1981 brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud); 1982 break; 1983 1984 case SHADER_OPCODE_MEMORY_FENCE: 1985 brw_memory_fence(p, dst); 1986 break; 1987 1988 case SHADER_OPCODE_FIND_LIVE_CHANNEL: { 1989 const struct brw_reg mask = 1990 brw_stage_has_packed_dispatch(devinfo, stage, 1991 prog_data) ? brw_imm_ud(~0u) : 1992 stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() : 1993 brw_dmask_reg(); 1994 brw_find_live_channel(p, dst, mask); 1995 break; 1996 } 1997 1998 case SHADER_OPCODE_BROADCAST: 1999 assert(inst->force_writemask_all); 2000 brw_broadcast(p, dst, src[0], src[1]); 2001 break; 2002 2003 case FS_OPCODE_SET_SAMPLE_ID: 2004 generate_set_sample_id(inst, dst, src[0], src[1]); 2005 break; 2006 2007 case FS_OPCODE_PACK_HALF_2x16_SPLIT: 2008 generate_pack_half_2x16_split(inst, dst, src[0], src[1]); 2009 break; 2010 2011 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X: 2012 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y: 2013 generate_unpack_half_2x16_split(inst, dst, src[0]); 2014 break; 2015 2016 case FS_OPCODE_PLACEHOLDER_HALT: 2017 /* This is the place where the final HALT needs to be inserted if 2018 * we've emitted any discards. If not, this will emit no code. 2019 */ 2020 if (!patch_discard_jumps_to_fb_writes()) { 2021 if (unlikely(debug_flag)) { 2022 annotation.ann_count--; 2023 } 2024 } 2025 break; 2026 2027 case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 2028 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2029 GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE); 2030 break; 2031 2032 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 2033 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2034 GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET); 2035 break; 2036 2037 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 2038 generate_pixel_interpolator_query(inst, dst, src[0], src[1], 2039 GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET); 2040 break; 2041 2042 case CS_OPCODE_CS_TERMINATE: 2043 generate_cs_terminate(inst, src[0]); 2044 break; 2045 2046 case SHADER_OPCODE_BARRIER: 2047 generate_barrier(inst, src[0]); 2048 break; 2049 2050 case BRW_OPCODE_DIM: 2051 assert(devinfo->is_haswell); 2052 assert(src[0].type == BRW_REGISTER_TYPE_DF); 2053 assert(dst.type == BRW_REGISTER_TYPE_DF); 2054 brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); 2055 break; 2056 2057 default: 2058 unreachable("Unsupported opcode"); 2059 2060 case SHADER_OPCODE_LOAD_PAYLOAD: 2061 unreachable("Should be lowered by lower_load_payload()"); 2062 } 2063 2064 if (multiple_instructions_emitted) 2065 continue; 2066 2067 if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { 2068 assert(p->next_insn_offset == last_insn_offset + 16 || 2069 !"conditional_mod, no_dd_check, or no_dd_clear set for IR " 2070 "emitting more than 1 instruction"); 2071 2072 brw_inst *last = &p->store[last_insn_offset / 16]; 2073 2074 if (inst->conditional_mod) 2075 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); 2076 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); 2077 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); 2078 } 2079 } 2080 2081 brw_set_uip_jip(p, start_offset); 2082 annotation_finalize(&annotation, p->next_insn_offset); 2083 2084 #ifndef NDEBUG 2085 bool validated = brw_validate_instructions(p, start_offset, &annotation); 2086 #else 2087 if (unlikely(debug_flag)) 2088 brw_validate_instructions(p, start_offset, &annotation); 2089 #endif 2090 2091 int before_size = p->next_insn_offset - start_offset; 2092 brw_compact_instructions(p, start_offset, annotation.ann_count, 2093 annotation.ann); 2094 int after_size = p->next_insn_offset - start_offset; 2095 2096 if (unlikely(debug_flag)) { 2097 fprintf(stderr, "Native code for %s\n" 2098 "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d" 2099 " bytes (%.0f%%)\n", 2100 shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count, 2101 spill_count, fill_count, promoted_constants, before_size, after_size, 2102 100.0f * (before_size - after_size) / before_size); 2103 2104 dump_assembly(p->store, annotation.ann_count, annotation.ann, 2105 p->devinfo); 2106 ralloc_free(annotation.mem_ctx); 2107 } 2108 assert(validated); 2109 2110 compiler->shader_debug_log(log_data, 2111 "%s SIMD%d shader: %d inst, %d loops, %u cycles, " 2112 "%d:%d spills:fills, Promoted %u constants, " 2113 "compacted %d to %d bytes.", 2114 _mesa_shader_stage_to_abbrev(stage), 2115 dispatch_width, before_size / 16, 2116 loop_count, cfg->cycle_count, spill_count, 2117 fill_count, promoted_constants, before_size, 2118 after_size); 2119 2120 return start_offset; 2121 } 2122 2123 const unsigned * 2124 fs_generator::get_assembly(unsigned int *assembly_size) 2125 { 2126 return brw_get_program(p, assembly_size); 2127 } 2128