1 /* 2 * Copyright 2010 Jerome Glisse <glisse (at) freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 */ 23 #include "r600_sq.h" 24 #include "r600_formats.h" 25 #include "r600_opcodes.h" 26 #include "r600_shader.h" 27 #include "r600d.h" 28 29 #include "sb/sb_public.h" 30 31 #include "pipe/p_shader_tokens.h" 32 #include "tgsi/tgsi_info.h" 33 #include "tgsi/tgsi_parse.h" 34 #include "tgsi/tgsi_scan.h" 35 #include "tgsi/tgsi_dump.h" 36 #include "util/u_bitcast.h" 37 #include "util/u_memory.h" 38 #include "util/u_math.h" 39 #include <stdio.h> 40 #include <errno.h> 41 42 /* CAYMAN notes 43 Why CAYMAN got loops for lots of instructions is explained here. 44 45 -These 8xx t-slot only ops are implemented in all vector slots. 
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT 47 These 8xx t-slot only opcodes become vector ops, with all four 48 slots expecting the arguments on sources a and b. Result is 49 broadcast to all channels. 50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64 51 These 8xx t-slot only opcodes become vector ops in the z, y, and 52 x slots. 53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64 54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64 55 SQRT_IEEE/_64 56 SIN/COS 57 The w slot may have an independent co-issued operation, or if the 58 result is required to be in the w slot, the opcode above may be 59 issued in the w slot as well. 60 The compiler must issue the source argument to slots z, y, and x 61 */ 62 63 /* Contents of r0 on entry to various shaders 64 65 VS - .x = VertexID 66 .y = RelVertexID (??) 67 .w = InstanceID 68 69 GS - r0.xyw, r1.xyz = per-vertex offsets 70 r0.z = PrimitiveID 71 72 TCS - .x = PatchID 73 .y = RelPatchID (??) 74 .z = InvocationID 75 .w = tess factor base. 76 77 TES - .x = TessCoord.x 78 - .y = TessCoord.y 79 - .z = RelPatchID (??) 
80 - .w = PrimitiveID 81 82 PS - face_gpr.z = SampleMask 83 face_gpr.w = SampleID 84 */ 85 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16) 86 static int r600_shader_from_tgsi(struct r600_context *rctx, 87 struct r600_pipe_shader *pipeshader, 88 union r600_shader_key key); 89 90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr, 91 int size, unsigned comp_mask) { 92 93 if (!size) 94 return; 95 96 if (ps->num_arrays == ps->max_arrays) { 97 ps->max_arrays += 64; 98 ps->arrays = realloc(ps->arrays, ps->max_arrays * 99 sizeof(struct r600_shader_array)); 100 } 101 102 int n = ps->num_arrays; 103 ++ps->num_arrays; 104 105 ps->arrays[n].comp_mask = comp_mask; 106 ps->arrays[n].gpr_start = start_gpr; 107 ps->arrays[n].gpr_count = size; 108 } 109 110 static void r600_dump_streamout(struct pipe_stream_output_info *so) 111 { 112 unsigned i; 113 114 fprintf(stderr, "STREAMOUT\n"); 115 for (i = 0; i < so->num_outputs; i++) { 116 unsigned mask = ((1 << so->output[i].num_components) - 1) << 117 so->output[i].start_component; 118 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n", 119 i, 120 so->output[i].stream, 121 so->output[i].output_buffer, 122 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 123 so->output[i].register_index, 124 mask & 1 ? "x" : "", 125 mask & 2 ? "y" : "", 126 mask & 4 ? "z" : "", 127 mask & 8 ? "w" : "", 128 so->output[i].dst_offset < so->output[i].start_component ? 
" (will lower)" : ""); 129 } 130 } 131 132 static int store_shader(struct pipe_context *ctx, 133 struct r600_pipe_shader *shader) 134 { 135 struct r600_context *rctx = (struct r600_context *)ctx; 136 uint32_t *ptr, i; 137 138 if (shader->bo == NULL) { 139 shader->bo = (struct r600_resource*) 140 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4); 141 if (shader->bo == NULL) { 142 return -ENOMEM; 143 } 144 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE); 145 if (R600_BIG_ENDIAN) { 146 for (i = 0; i < shader->shader.bc.ndw; ++i) { 147 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]); 148 } 149 } else { 150 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr)); 151 } 152 rctx->b.ws->buffer_unmap(shader->bo->buf); 153 } 154 155 return 0; 156 } 157 158 int r600_pipe_shader_create(struct pipe_context *ctx, 159 struct r600_pipe_shader *shader, 160 union r600_shader_key key) 161 { 162 struct r600_context *rctx = (struct r600_context *)ctx; 163 struct r600_pipe_shader_selector *sel = shader->selector; 164 int r; 165 bool dump = r600_can_dump_shader(&rctx->screen->b, 166 tgsi_get_processor_type(sel->tokens)); 167 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB); 168 unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); 169 unsigned export_shader; 170 171 shader->shader.bc.isa = rctx->isa; 172 173 if (dump) { 174 fprintf(stderr, "--------------------------------------------------------------\n"); 175 tgsi_dump(sel->tokens, 0); 176 177 if (sel->so.num_outputs) { 178 r600_dump_streamout(&sel->so); 179 } 180 } 181 r = r600_shader_from_tgsi(rctx, shader, key); 182 if (r) { 183 R600_ERR("translation from TGSI failed !\n"); 184 goto error; 185 } 186 if (shader->shader.processor_type == PIPE_SHADER_VERTEX) { 187 /* only disable for vertex shaders in tess paths */ 188 if (key.vs.as_ls) 189 use_sb = 0; 190 } 191 use_sb &= (shader->shader.processor_type 
		   != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* SB is also skipped for atomics and images */
	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* Either plain disassembly, or the sb pass (which can also disassemble). */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state.
	 */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}

/* Release the GPU buffer, bytecode and command buffer owned by the shader. */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}

/*
 * tgsi -> r600 shader
 */
struct r600_shader_tgsi_instruction;

/* Decoded TGSI source operand: register select, per-channel swizzle and
 * modifiers, constant-buffer bank, and literal values when used. */
struct r600_shader_src {
	unsigned				sel;
	unsigned				swizzle[4];
	unsigned				neg;
	unsigned				abs;
	unsigned				rel;
	unsigned				kc_bank;
	boolean					kc_rel; /* true if cache bank
							   is indexed */
	uint32_t				value[4];
};

/* Per-interpolator state on evergreen: whether it is used and which
 * ij pair index it was assigned. */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};

/* All state carried through the TGSI -> r600 bytecode translation. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;
	unsigned				file_offset[TGSI_FILE_COUNT];
	unsigned				temp_reg;
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp			eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;
	unsigned				edgeflag_output;
	int					cs_block_size_reg;
	int					cs_grid_size_reg;
	bool					cs_block_size_loaded, cs_grid_size_loaded;
	int					fragcoord_input;
	int					next_ring_offset;
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int					gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned				tess_input_info; /* temp with tess input offsets */
	unsigned				tess_output_info; /* temp with tess output offsets */
	unsigned				thread_id_gpr; /* temp with thread id calculated for images */
	bool					thread_id_gpr_loaded;
};

/* Maps one TGSI opcode to the hw op and the handler that emits it. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};

static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
static const struct r600_shader_tgsi_instruction
	r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg);
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			      const struct r600_shader_src *shader_src,
			      unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask);

/* True unless running on Hemlock/Cypress/Juniper, which do not need the
 * 8xx stack-depth workaround. */
static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
{
	if (ctx->bc->family == CHIP_HEMLOCK ||
	    ctx->bc->family == CHIP_CYPRESS ||
	    ctx->bc->family == CHIP_JUNIPER)
		return false;
	return true;
}

/* Return the index (0..3) of the highest bit set in the low four bits of
 * writemask, or 0 when no bit is set. */
static int tgsi_last_instruction(unsigned writemask)
{
	int i, lasti = 0;

	for (i = 0; i < 4; i++) {
		if (writemask & (1 << i)) {
			lasti = i;
		}
	}
	return lasti;
}

/* Reject TGSI constructs this backend cannot handle: multiple destination
 * registers (except DFRACEXP) and 2D-indexed src/dst registers outside the
 * allowed file/stage combinations.  Returns 0 or -EINVAL. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j <
	     i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				/* fallthrough */
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				/* fallthrough */
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}

/* Map (interpolate mode, location) to the 0..5 slot used to index
 * eg_interpolators[]: linear * 3 + (sample=0, center=1, centroid=2).
 * Returns -1 for modes that are not interpolated (e.g. constant). */
int eg_get_interpolator_index(unsigned interpolate, unsigned location)
{
	if (interpolate == TGSI_INTERPOLATE_COLOR ||
	    interpolate == TGSI_INTERPOLATE_LINEAR ||
	    interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
	{
		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
		int loc;

		switch(location) {
		case TGSI_INTERPOLATE_LOC_CENTER:
			loc = 1;
			break;
		case TGSI_INTERPOLATE_LOC_CENTROID:
			loc = 2;
			break;
		case TGSI_INTERPOLATE_LOC_SAMPLE:
		default:
			loc = 0; break;
		}

		return is_linear * 3 + loc;
	}

	return -1;
}

/* Copy the ij pair index assigned to this input's interpolator onto the
 * input itself; the interpolator must have been enabled earlier. */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}

/* Emit the INTERP_ZW/INTERP_XY pairs that interpolate one input into its GPR. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index =
		ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		/* first group of four does INTERP_ZW, second INTERP_XY */
		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only slots 2..5 actually write to the input's GPR */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Load a flat (non-interpolated) input into all four channels of its GPR
 * with INTERP_LOAD_P0. */
static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_INTERP_LOAD_P0;

		alu.dst.sel = ctx->shader->input[input].gpr;
		alu.dst.write = 1;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[0].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * Special export handling in shaders
 *
 * shader export ARRAY_BASE for EXPORT_POS:
 * 60 is position
 * 61 is misc vector
 * 62, 63 are clip distance vectors
 *
 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of
 *                              export 61
 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
 *                     exclusive from render target index)
 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
 *
 *
 * shader export ARRAY_BASE for EXPORT_PIXEL:
 * 0-7 CB targets
 * 61 computed Z vector
 *
 * The use of the values exported in the computed Z vector are controlled
 * by DB_SHADER_CONTROL:
 * Z_EXPORT_ENABLE - Z as a float in RED
 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
 * DB_SOURCE_FORMAT - export control restrictions
 *
 */


/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
static int r600_spi_sid(struct r600_shader_io * io)
{
	int index, name = io->name;

	/* These params are handled differently, they don't need
	 * semantic indices, so we'll use 0 for them.
	 */
	if (name == TGSI_SEMANTIC_POSITION ||
	    name == TGSI_SEMANTIC_PSIZE ||
	    name == TGSI_SEMANTIC_EDGEFLAG ||
	    name == TGSI_SEMANTIC_FACE ||
	    name == TGSI_SEMANTIC_SAMPLEMASK)
		index = 0;
	else {
		if (name == TGSI_SEMANTIC_GENERIC) {
			/* For generic params simply use sid from tgsi */
			index = io->sid;
		} else {
			/* For non-generic params - pack name and sid into 8 bits */
			index = 0x80 | (name<<3) | (io->sid);
		}

		/* Make sure that all really used indices have nonzero value, so
		 * we can just compare it to 0 later instead of comparing the name
		 * with different values to detect special cases.
*/ 625 index++; 626 } 627 628 return index; 629 }; 630 631 /* we need this to get a common lds index for vs/tcs/tes input/outputs */ 632 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index) 633 { 634 switch (semantic_name) { 635 case TGSI_SEMANTIC_POSITION: 636 return 0; 637 case TGSI_SEMANTIC_PSIZE: 638 return 1; 639 case TGSI_SEMANTIC_CLIPDIST: 640 assert(index <= 1); 641 return 2 + index; 642 case TGSI_SEMANTIC_GENERIC: 643 if (index <= 63-4) 644 return 4 + index - 9; 645 else 646 /* same explanation as in the default statement, 647 * the only user hitting this is st/nine. 648 */ 649 return 0; 650 651 /* patch indices are completely separate and thus start from 0 */ 652 case TGSI_SEMANTIC_TESSOUTER: 653 return 0; 654 case TGSI_SEMANTIC_TESSINNER: 655 return 1; 656 case TGSI_SEMANTIC_PATCH: 657 return 2 + index; 658 659 default: 660 /* Don't fail here. The result of this function is only used 661 * for LS, TCS, TES, and GS, where legacy GL semantics can't 662 * occur, but this function is called for all vertex shaders 663 * before it's known whether LS will be compiled or not. 
		 */
		return 0;
	}
}

/* turn input into interpolate on EG */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			r = evergreen_interp_alu(ctx, index);
		} else {
			r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}

/* Choose between front and back color with CNDGT on the face GPR;
 * the selected value is written over the front-color GPR. */
static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i==3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	/* On CAYMAN, MULLO_INT is a vector-slot op: replicate it across the
	 * four slots and only write the requested destination channel. */
	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan =
					src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan;
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	/* for literal sources the channel argument carries the literal value */
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* put it in temp_reg.x */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
	/* ADD
	   Dimension - patch0_offset (input_vals.z),
	   Non-dim - patch0_data_offset (input_vals.w)
	*/
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}

/* Index 0 is the AR register; higher indices map to the loop index regs. */
static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
{
	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
}

/* Allocate the next free driver temporary GPR. */
static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}

/* Append a PRIMID output (written in .z, GPR 0) to the VS output list. */
static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
{
	int i;
	i = ctx->shader->noutput++;
	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
	ctx->shader->output[i].sid = 0;
	ctx->shader->output[i].gpr = 0;
	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
	ctx->shader->output[i].write_mask = 0x4;
	ctx->shader->output[i].spi_sid = prim_id_sid;

	return 0;
}

/* Emit the barrier op selected by inst_info as a lone ALU instruction. */
static int tgsi_barrier(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.last = 1;

	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* Process one TGSI declaration token: record inputs/outputs/temporaries/
 * atomics and system values on the shader.  Returns 0 or -EINVAL. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i <
			       ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask =
				d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* flag the misc-vector fields (PA_CL_VS_OUT_CNTL) used by
				 * this output's semantic */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* indirectly addressed temporaries become GPR arrays */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		break;

	case TGSI_FILE_HW_ATOMIC:
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base +
			ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			/* fetch the tess factors from LDS into a fixed GPR
			 * (3 for TESSINNER, 2 for TESSOUTER) */
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ?
					1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y -- completes r1.z = 1 - r0.x - r1.y,
			 * the third barycentric tess coordinate */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}

/* Scan the shader for system values (SAMPLEMASK, SAMPLEID/SAMPLEPOS) and
 * for interpolateAtSample/Offset/Centroid use, record the enabled ones in
 * ctx->shader->input[] and hand out GPR numbers starting at gpr_offset.
 *
 * Returns gpr_offset plus the number of GPRs allocated here (also returns
 * gpr_offset unchanged if the token stream cannot be parsed).
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;			/* ctx field that receives the allocated GPR */
		unsigned name, alternate_name;	/* TGSI semantics that trigger this input */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}

/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit
 * any sample or centroid inputs
 * if perspective and linear are required
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	unsigned i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* mark the interpolators needed by the declared inputs */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* convert interpolator (ij pair) count to a GPR count: two per GPR,
	 * rounded up; this becomes the base offset for system value GPRs */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}

/* sample_id_sel == NULL means fetch for current sample */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	/* fetch the sample position from the info constant buffer, indexed
	 * either by the current sample id or by an explicit sample_id source */
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* copy the requested sample id channel into t1.x for use as index */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
vtx.num_format_all = 2; 1294 vtx.format_comp_all = 1; 1295 vtx.use_const_fields = 0; 1296 vtx.offset = 0; 1297 vtx.endian = r600_endian_swap(32); 1298 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 1299 1300 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 1301 if (r) 1302 return r; 1303 1304 return t1; 1305 } 1306 1307 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block) 1308 { 1309 struct r600_bytecode_vtx vtx; 1310 int r, t1; 1311 1312 if (ctx->cs_block_size_loaded) 1313 return ctx->cs_block_size_reg; 1314 if (ctx->cs_grid_size_loaded) 1315 return ctx->cs_grid_size_reg; 1316 1317 t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg; 1318 struct r600_bytecode_alu alu; 1319 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 1320 alu.op = ALU_OP1_MOV; 1321 alu.src[0].sel = V_SQ_ALU_SRC_0; 1322 alu.dst.sel = t1; 1323 alu.dst.write = 1; 1324 alu.last = 1; 1325 r = r600_bytecode_add_alu(ctx->bc, &alu); 1326 if (r) 1327 return r; 1328 1329 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); 1330 vtx.op = FETCH_OP_VFETCH; 1331 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; 1332 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; 1333 vtx.src_gpr = t1; 1334 vtx.src_sel_x = 0; 1335 1336 vtx.mega_fetch_count = 16; 1337 vtx.dst_gpr = t1; 1338 vtx.dst_sel_x = 0; 1339 vtx.dst_sel_y = 1; 1340 vtx.dst_sel_z = 2; 1341 vtx.dst_sel_w = 7; 1342 vtx.data_format = FMT_32_32_32_32; 1343 vtx.num_format_all = 1; 1344 vtx.format_comp_all = 0; 1345 vtx.use_const_fields = 0; 1346 vtx.offset = load_block ? 
0 : 16; // first element is size of buffer 1347 vtx.endian = r600_endian_swap(32); 1348 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */ 1349 1350 r = r600_bytecode_add_vtx(ctx->bc, &vtx); 1351 if (r) 1352 return r; 1353 1354 if (load_block) 1355 ctx->cs_block_size_loaded = true; 1356 else 1357 ctx->cs_grid_size_loaded = true; 1358 return t1; 1359 } 1360 1361 static void tgsi_src(struct r600_shader_ctx *ctx, 1362 const struct tgsi_full_src_register *tgsi_src, 1363 struct r600_shader_src *r600_src) 1364 { 1365 memset(r600_src, 0, sizeof(*r600_src)); 1366 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX; 1367 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY; 1368 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ; 1369 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW; 1370 r600_src->neg = tgsi_src->Register.Negate; 1371 r600_src->abs = tgsi_src->Register.Absolute; 1372 1373 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) { 1374 int index; 1375 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) && 1376 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) && 1377 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) { 1378 1379 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX; 1380 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs); 1381 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL) 1382 return; 1383 } 1384 index = tgsi_src->Register.Index; 1385 r600_src->sel = V_SQ_ALU_SRC_LITERAL; 1386 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value)); 1387 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) { 1388 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) { 1389 r600_src->swizzle[0] = 2; // Z value 1390 r600_src->swizzle[1] = 2; 1391 r600_src->swizzle[2] = 2; 1392 r600_src->swizzle[3] = 2; 1393 r600_src->sel = ctx->face_gpr; 1394 } else if 
			   (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			/* sample position is fetched from the info buffer into a temp */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* TCS: InvocationID lives in r0.z (see r0 layout comment at top) */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL) {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 2;
				r600_src->swizzle[1] = 2;
				r600_src->swizzle[2] = 2;
				r600_src->swizzle[3] = 2;
			} else {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 3;
				r600_src->swizzle[1] = 3;
				r600_src->swizzle[2] = 3;
				r600_src->swizzle[3] = 3;
			}
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}

/* Fetch a constant-buffer element addressed by the AR register (plus an
 * optional constant offset) into dst_reg via a vertex fetch.
 */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* dst_reg.ar_chan = AR + offset, used as the fetch index below */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}

/* Fetch one per-vertex GS input from the ESGS ring into dst_reg, handling
 * indirect vertex (dimension) and element indices.
 */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		if (src->DimIndirect.Index > 0) {
			/* relative addressing below indexes off AR, so copy the
			 * address register into AR first */
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ?
					   3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = treg[AR].x -- the indirectly selected vertex offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg:
		 * t2.x = (addr + first) * 4 + ring_offset */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}

/* Rewrite GS per-vertex input references for the current instruction:
 * PRIMID is read from R0.z, dimensioned (per-vertex) inputs are fetched
 * from the ESGS ring into a fresh temp register.
 */
static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned i;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_full_src_register *src = &inst->Src[i];

		if (src->Register.File == TGSI_FILE_INPUT) {
			if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
				/* primitive id is in R0.z */
				ctx->src[i].sel = 0;
				ctx->src[i].swizzle[0] = 2;
			}
		}
		if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
			int treg = r600_get_temp(ctx);

			/* NOTE(review): fetch_gs_input's return value is ignored
			 * here, so an emit failure would go unnoticed -- confirm
			 * whether this should be propagated. */
			fetch_gs_input(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
	}
	return 0;
}


/* Tessellation shaders pass outputs to the next shader using LDS.
 *
 * LS outputs = TCS(HS) inputs
 * TCS(HS) outputs = TES(DS) inputs
 *
 * The LDS layout is:
 * - TCS inputs for patch 0
 * - TCS inputs for patch 1
 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
 * - ...
 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
 * - TCS outputs for patch 1
 * - Per-patch TCS outputs for patch 1
 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
 * - ...
 *
 * All three shaders VS(LS), TCS, TES share the same LDS space.
 */
/* this will return with the dw address in temp_reg.x */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += vertex_index * stride_bytes */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg: temp.x += addr * 16 */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}

/* Read the channels selected by mask from LDS (byte address in temp_reg.x)
 * into the corresponding channels of dst_reg.
 */
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* force a new CF block once the current one has grown large */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	lasti = tgsi_last_instruction(mask);
	/* temp.i = temp.x + 4*i for each extra channel (chan 0 reuses temp.x) */
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read
from LDS_OQ_A_POP */ 1854 memset(&alu, 0, sizeof(alu)); 1855 1856 alu.op = ALU_OP1_MOV; 1857 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP; 1858 alu.src[0].chan = 0; 1859 alu.dst.sel = dst_reg; 1860 alu.dst.chan = i; 1861 alu.dst.write = 1; 1862 alu.last = 1; 1863 r = r600_bytecode_add_alu(ctx->bc, &alu); 1864 if (r) 1865 return r; 1866 } 1867 return 0; 1868 } 1869 1870 static int fetch_mask(struct tgsi_src_register *reg) 1871 { 1872 int mask = 0; 1873 mask |= 1 << reg->SwizzleX; 1874 mask |= 1 << reg->SwizzleY; 1875 mask |= 1 << reg->SwizzleZ; 1876 mask |= 1 << reg->SwizzleW; 1877 return mask; 1878 } 1879 1880 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 1881 { 1882 int r; 1883 unsigned temp_reg = r600_get_temp(ctx); 1884 1885 r = get_lds_offset0(ctx, 2, temp_reg, 1886 src->Register.Dimension ? false : true); 1887 if (r) 1888 return r; 1889 1890 /* the base address is now in temp.x */ 1891 r = r600_get_byte_address(ctx, temp_reg, 1892 NULL, src, ctx->tess_output_info, 1); 1893 if (r) 1894 return r; 1895 1896 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register)); 1897 if (r) 1898 return r; 1899 return 0; 1900 } 1901 1902 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) 1903 { 1904 int r; 1905 unsigned temp_reg = r600_get_temp(ctx); 1906 1907 /* t.x = ips * r0.y */ 1908 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, 1909 temp_reg, 0, 1910 ctx->tess_input_info, 0, 1911 0, 1); 1912 1913 if (r) 1914 return r; 1915 1916 /* the base address is now in temp.x */ 1917 r = r600_get_byte_address(ctx, temp_reg, 1918 NULL, src, ctx->tess_input_info, 1); 1919 if (r) 1920 return r; 1921 1922 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register)); 1923 if (r) 1924 return r; 1925 return 0; 1926 } 1927 1928 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int 
 dst_reg)
{
	int r;
	unsigned temp_reg = r600_get_temp(ctx);

	r = get_lds_offset0(ctx, 1, temp_reg,
			    src->Register.Dimension ? false : true);
	if (r)
		return r;
	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  NULL, src,
				  ctx->tess_output_info, 1);
	if (r)
		return r;

	r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
	if (r)
		return r;
	return 0;
}

/* Replace LDS-resident sources of the current instruction (TES/TCS inputs,
 * TCS outputs) with temps holding the fetched values so the generic ALU
 * emission paths can treat them like ordinary GPRs. */
static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned i;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_full_src_register *src = &inst->Src[i];

		if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
			int treg = r600_get_temp(ctx);
			fetch_tes_input(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
			int treg = r600_get_temp(ctx);
			fetch_tcs_input(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
		if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
			int treg = r600_get_temp(ctx);
			fetch_tcs_output(ctx, src, treg);
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
		}
	}
	return 0;
}

/* Copy constant-buffer sources into temps so the instruction observes the
 * hardware's constant-read limits; all but the last constant source (j > 0)
 * get copied, and relatively-addressed constants are always resolved via
 * tgsi_fetch_rel_const. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			/* sel - 512: constant sels are biased by 512 */
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}

/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nliteral, r;

	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			nliteral++;
		}
	}
	/* copy every literal source except the last one (j reaches 0) */
	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].value = ctx->src[i].value[k];
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			j--;
		}
	}
	return 0;
}

/* Emit front/back color selection for every color input of the shader. */
static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
{
	int i, r, count = ctx->shader->ninput;

	for (i = 0; i < count; i++) {
		if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
			r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Emit MEM_STREAM exports implementing transform feedback (streamout) for
 * the given stream; stream == -1 writes outputs of all streams. */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
			  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X.
		 */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->chip_class >= EVERGREEN) {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			/* STREAMn_BUFm opcodes are laid out 4 per stream */
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}

/* The VS edge flag is produced as a float; clamp it to [0,1] and convert
 * to int in place so the hardware sees exactly 0 or 1. */
static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	unsigned reg;

	if (!ctx->shader->vs_out_edgeflag)
		return;

	reg = ctx->shader->output[ctx->edgeflag_output].gpr;

	/* clamp(x, 0, 1) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = reg;
	alu.dst.sel = reg;
	alu.dst.write = 1;
	alu.dst.clamp = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx->bc, &alu);

	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP1_FLT_TO_INT;
	alu.src[0].sel = reg;
	alu.dst.sel = reg;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx->bc, &alu);
}

/* Build the VS-like "GS copy shader" that reads vertices back from the
 * GSVS ring and performs the real position/parameter exports plus
 * streamout, one predicated branch per active vertex stream.
 * Returns the result of r600_bytecode_build (0 on success). */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool
	only_ring_0 = true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff  (low bits: vertex offset in the ring) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30  (top two bits: stream id) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;

	/* one predicated block per ring; ring 0 is always emitted */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only stream-0 outputs are exported to the rasterizer */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* the hardware requires at least one POS and one PARAM export */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}

/* For indirect ring writes, advance the per-stream export offset register
 * by one vertex worth of dwords (ring offset is kept in units of 16 bytes,
 * hence the >> 4). */
static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
{
	if (ind) {
		struct r600_bytecode_alu alu;
		int r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
		alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Write the current vertex's outputs to the GS output ring for a stream. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct
 pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS - skip it */
			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;
			idx++;
		}

		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}


/* Fetch the tessellation LDS layout information from
 * R600_LDS_INFO_CONST_BUFFER into the reserved tess info registers
 * (dword 0 for the input layout, dword 16 for the output layout). */
static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_vtx vtx;
	int temp_val = ctx->temp_reg;
	/* need to store the TCS output somewhere - zero the fetch index */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   temp_val, 0,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* used by VS/TCS */
	if (ctx->tess_input_info) {
		/* fetch tcs input values into resv space */
		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.data_format = FMT_32_32_32_32;
		vtx.num_format_all = 2;
		vtx.format_comp_all = 1;
		vtx.use_const_fields = 0;
		vtx.endian = r600_endian_swap(32);
		vtx.srf_mode_all = 1;
		vtx.offset = 0;
		vtx.dst_gpr = ctx->tess_input_info;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		vtx.src_gpr = temp_val;
		vtx.src_sel_x = 0;

		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
		if (r)
			return r;
	}

	/* used by TCS/TES */
	if (ctx->tess_output_info) {
		/* fetch tcs output values into resv space */
		memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.data_format = FMT_32_32_32_32;
		vtx.num_format_all = 2;
		vtx.format_comp_all = 1;
		vtx.use_const_fields = 0;
		vtx.endian = r600_endian_swap(32);
		vtx.srf_mode_all = 1;
		vtx.offset = 16;
		vtx.dst_gpr = ctx->tess_output_info;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		vtx.src_gpr = temp_val;
		vtx.src_sel_x = 0;

		r = r600_bytecode_add_vtx(ctx->bc, &vtx);
		if (r)
			return r;
	}
	return 0;
}

/* VS-as-LS epilogue: store all VS outputs to LDS for the TCS to read.
 * Each output is written as two LDS_WRITE_REL ops of two dwords each, at
 * byte address vertex_stride * rel_id + 16 * lds_unique_index. */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);

		/* temp.y = base + param * 16 (only needed when param != 0) */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the second dword pair (+8 bytes) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* write xy then zw, two dwords per LDS_WRITE_REL */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ?
					   1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Store the destination of the current TCS instruction to LDS when it
 * targets a TGSI output.  Adjacent written channel pairs (xy / zw) are
 * combined into one two-dword LDS_WRITE_REL; lone channels use LDS_WRITE. */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	lasti = tgsi_last_instruction(write_mask);
	/* per-channel byte addresses: temp.i = temp.x + 4 * i */
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* xy or zw pair fully written - use one two-dword write */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;	/* second channel of the pair already handled */
			continue;
		}
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Read nc tessellation-factor components of output output_idx back from
 * LDS into that output's GPR. */
static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
				 int output_idx, int nc)
{
	int param;
	unsigned temp_reg = r600_get_temp(ctx);
	unsigned name = ctx->shader->output[output_idx].name;
	int dreg = ctx->shader->output[output_idx].gpr;
	int r;

	param = r600_get_lds_unique_index(name, 0);
	r = get_lds_offset0(ctx, 1, temp_reg, true);
	if (r)
		return r;

	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;
	}

	do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
	return 0;
}

/* TCS epilogue: write the TESSOUTER/TESSINNER factors to the tess factor
 * buffer via GDS TF_WRITE, executed only by invocation 0. */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int i, r;
	unsigned j;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.z (InvocationID), 0 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	/* one temp per TF_WRITE pair (address + value per channel pair) */
	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	for (j = 0; j < ctx->shader->noutput; j++) {
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = j;
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = j;
	}

	if (tessouter_idx == -1)
		return -1;

	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		/* isolines store the two outer factors in swapped order */
		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
			if (out_comp == 1)
				out_comp = 0;
			else if (out_comp == 0)
				out_comp = 1;
		}

		/* treg[i/2].{x,z} = byte address, treg[i/2].{y,w} = value */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);
		gds.src_sel_y = 1 + (2 * (i % 2));
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}

/*
 * We have to work out the thread ID for load and atomic
 * operations, which store the returned value to an index
 * in an intermediate buffer.
 * The index is calculated by taking the thread id,
 * calculated from the MBCNT instructions.
 * Then the shader engine ID is multiplied by 256,
 * and the wave id is added.
 * Then the result is multiplied by 64 and thread id is
 * added.
 */
static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* Computed once per shader; the result stays live in
	 * ctx->thread_id_gpr for the rest of translation. */
	if (ctx->thread_id_gpr_loaded)
		return 0;

	/* temp.x = lane index from MBCNT over a full 0xffffffff mask
	 * (low 32 lanes, accumulating the previous slot's result). */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = MBCNT high half.  NOTE(review): only temp.x is consumed
	 * by the final MULADD below -- presumably the high half is folded
	 * in through the ACCUM_PREV slot ordering; confirm against the ISA. */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32HI_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = SE_ID * 256 + HW_WAVE_ID (unique wave number) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD_UINT24;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 2;
	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 256;
	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
	alu.dst.write = 1;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* thread_id_gpr.y = wave_number * 64 + lane  (64 threads/wave) */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   ctx->thread_id_gpr, 1,
			   ctx->temp_reg, 2,
			   V_SQ_ALU_SRC_LITERAL, 0x40,
			   ctx->temp_reg, 0);
	if (r)
		return r;
	ctx->thread_id_gpr_loaded = true;
	return 0;
}

/* Translate one TGSI shader into r600 bytecode.  Scans the tokens,
 * lays out the GPR file, runs per-opcode emission, then appends the
 * stage-appropriate exports (position/param/pixel, ring, or LDS
 * writes) and the program-end marker.  Returns 0 or -errno.
 * (Function body continues below.) */
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key)
{
	struct r600_screen *rscreen = rctx->screen;
	struct r600_shader *shader = &pipeshader->shader;
	struct tgsi_token *tokens = pipeshader->selector->tokens;
	struct pipe_stream_output_info so = pipeshader->selector->so;
	struct tgsi_full_immediate
*immediate; 3109 struct r600_shader_ctx ctx; 3110 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)]; 3111 unsigned output_done, noutput; 3112 unsigned opcode; 3113 int j, k, r = 0; 3114 unsigned i; 3115 int next_param_base = 0, next_clip_base; 3116 int max_color_exports = MAX2(key.ps.nr_cbufs, 1); 3117 bool indirect_gprs; 3118 bool ring_outputs = false; 3119 bool lds_outputs = false; 3120 bool lds_inputs = false; 3121 bool pos_emitted = false; 3122 3123 ctx.bc = &shader->bc; 3124 ctx.shader = shader; 3125 3126 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family, 3127 rscreen->has_compressed_msaa_texturing); 3128 ctx.tokens = tokens; 3129 tgsi_scan_shader(tokens, &ctx.info); 3130 shader->indirect_files = ctx.info.indirect_files; 3131 3132 shader->uses_doubles = ctx.info.uses_doubles; 3133 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC]; 3134 shader->nsys_inputs = 0; 3135 3136 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 || 3137 ctx.info.file_count[TGSI_FILE_BUFFER] > 0; 3138 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER)); 3139 tgsi_parse_init(&ctx.parse, tokens); 3140 ctx.type = ctx.info.processor; 3141 shader->processor_type = ctx.type; 3142 ctx.bc->type = shader->processor_type; 3143 3144 switch (ctx.type) { 3145 case PIPE_SHADER_VERTEX: 3146 shader->vs_as_gs_a = key.vs.as_gs_a; 3147 shader->vs_as_es = key.vs.as_es; 3148 shader->vs_as_ls = key.vs.as_ls; 3149 shader->atomic_base = key.vs.first_atomic_counter; 3150 if (shader->vs_as_es) 3151 ring_outputs = true; 3152 if (shader->vs_as_ls) 3153 lds_outputs = true; 3154 break; 3155 case PIPE_SHADER_GEOMETRY: 3156 ring_outputs = true; 3157 shader->atomic_base = key.gs.first_atomic_counter; 3158 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix; 3159 break; 3160 case PIPE_SHADER_TESS_CTRL: 3161 shader->tcs_prim_mode = key.tcs.prim_mode; 3162 shader->atomic_base = key.tcs.first_atomic_counter; 3163 
lds_outputs = true; 3164 lds_inputs = true; 3165 break; 3166 case PIPE_SHADER_TESS_EVAL: 3167 shader->tes_as_es = key.tes.as_es; 3168 shader->atomic_base = key.tes.first_atomic_counter; 3169 lds_inputs = true; 3170 if (shader->tes_as_es) 3171 ring_outputs = true; 3172 break; 3173 case PIPE_SHADER_FRAGMENT: 3174 shader->two_side = key.ps.color_two_side; 3175 shader->atomic_base = key.ps.first_atomic_counter; 3176 shader->rat_base = key.ps.nr_cbufs; 3177 shader->image_size_const_offset = key.ps.image_size_const_offset; 3178 break; 3179 case PIPE_SHADER_COMPUTE: 3180 shader->rat_base = 0; 3181 shader->image_size_const_offset = 0; 3182 break; 3183 default: 3184 break; 3185 } 3186 3187 if (shader->vs_as_es || shader->tes_as_es) { 3188 ctx.gs_for_vs = &rctx->gs_shader->current->shader; 3189 } else { 3190 ctx.gs_for_vs = NULL; 3191 } 3192 3193 ctx.next_ring_offset = 0; 3194 ctx.gs_out_ring_offset = 0; 3195 ctx.gs_next_vertex = 0; 3196 ctx.gs_stream_output_info = &so; 3197 3198 ctx.face_gpr = -1; 3199 ctx.fixed_pt_position_gpr = -1; 3200 ctx.fragcoord_input = -1; 3201 ctx.colors_used = 0; 3202 ctx.clip_vertex_write = 0; 3203 ctx.thread_id_gpr_loaded = false; 3204 3205 ctx.cs_block_size_reg = -1; 3206 ctx.cs_grid_size_reg = -1; 3207 ctx.cs_block_size_loaded = false; 3208 ctx.cs_grid_size_loaded = false; 3209 3210 shader->nr_ps_color_exports = 0; 3211 shader->nr_ps_max_color_exports = 0; 3212 3213 3214 /* register allocations */ 3215 /* Values [0,127] correspond to GPR[0..127]. 3216 * Values [128,159] correspond to constant buffer bank 0 3217 * Values [160,191] correspond to constant buffer bank 1 3218 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG) 3219 * Values [256,287] correspond to constant buffer bank 2 (EG) 3220 * Values [288,319] correspond to constant buffer bank 3 (EG) 3221 * Other special values are shown in the list below. 3222 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. 
(RV670+) 3223 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+) 3224 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+) 3225 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+) 3226 * 248 SQ_ALU_SRC_0: special constant 0.0. 3227 * 249 SQ_ALU_SRC_1: special constant 1.0 float. 3228 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer. 3229 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer. 3230 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float. 3231 * 253 SQ_ALU_SRC_LITERAL: literal constant. 3232 * 254 SQ_ALU_SRC_PV: previous vector result. 3233 * 255 SQ_ALU_SRC_PS: previous scalar result. 3234 */ 3235 for (i = 0; i < TGSI_FILE_COUNT; i++) { 3236 ctx.file_offset[i] = 0; 3237 } 3238 3239 if (ctx.type == PIPE_SHADER_VERTEX) { 3240 3241 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3242 if (ctx.info.num_inputs) 3243 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); 3244 } 3245 if (ctx.type == PIPE_SHADER_FRAGMENT) { 3246 if (ctx.bc->chip_class >= EVERGREEN) 3247 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx); 3248 else 3249 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]); 3250 } 3251 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3252 /* FIXME 1 would be enough in some cases (3 or less input vertices) */ 3253 ctx.file_offset[TGSI_FILE_INPUT] = 2; 3254 } 3255 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3256 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3257 if (ctx.type == PIPE_SHADER_TESS_EVAL) { 3258 bool add_tesscoord = false, add_tess_inout = false; 3259 ctx.file_offset[TGSI_FILE_INPUT] = 1; 3260 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3261 /* if we have tesscoord save one reg */ 3262 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD) 3263 add_tesscoord = true; 3264 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER || 3265 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER) 3266 add_tess_inout = 
true; 3267 } 3268 if (add_tesscoord || add_tess_inout) 3269 ctx.file_offset[TGSI_FILE_INPUT]++; 3270 if (add_tess_inout) 3271 ctx.file_offset[TGSI_FILE_INPUT]+=2; 3272 } 3273 if (ctx.type == PIPE_SHADER_COMPUTE) { 3274 ctx.file_offset[TGSI_FILE_INPUT] = 2; 3275 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { 3276 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE) 3277 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3278 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE) 3279 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++; 3280 } 3281 } 3282 3283 ctx.file_offset[TGSI_FILE_OUTPUT] = 3284 ctx.file_offset[TGSI_FILE_INPUT] + 3285 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3286 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + 3287 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1; 3288 3289 /* Outside the GPR range. This will be translated to one of the 3290 * kcache banks later. */ 3291 ctx.file_offset[TGSI_FILE_CONSTANT] = 512; 3292 3293 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; 3294 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + 3295 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1; 3296 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1; 3297 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2; 3298 3299 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3300 ctx.tess_input_info = ctx.bc->ar_reg + 3; 3301 ctx.tess_output_info = ctx.bc->ar_reg + 4; 3302 ctx.temp_reg = ctx.bc->ar_reg + 5; 3303 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) { 3304 ctx.tess_input_info = 0; 3305 ctx.tess_output_info = ctx.bc->ar_reg + 3; 3306 ctx.temp_reg = ctx.bc->ar_reg + 4; 3307 } else if (ctx.type == PIPE_SHADER_GEOMETRY) { 3308 ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3; 3309 ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4; 3310 ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5; 3311 ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6; 3312 ctx.temp_reg = ctx.bc->ar_reg + 7; 3313 if (ctx.shader->gs_tri_strip_adj_fix) 
{ 3314 ctx.gs_rotated_input[0] = ctx.bc->ar_reg + 7; 3315 ctx.gs_rotated_input[1] = ctx.bc->ar_reg + 8; 3316 ctx.temp_reg += 2; 3317 } else { 3318 ctx.gs_rotated_input[0] = 0; 3319 ctx.gs_rotated_input[1] = 1; 3320 } 3321 } else { 3322 ctx.temp_reg = ctx.bc->ar_reg + 3; 3323 } 3324 3325 if (shader->uses_images) { 3326 ctx.thread_id_gpr = ctx.temp_reg++; 3327 ctx.thread_id_gpr_loaded = false; 3328 } 3329 3330 shader->max_arrays = 0; 3331 shader->num_arrays = 0; 3332 if (indirect_gprs) { 3333 3334 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) { 3335 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT], 3336 ctx.file_offset[TGSI_FILE_OUTPUT] - 3337 ctx.file_offset[TGSI_FILE_INPUT], 3338 0x0F); 3339 } 3340 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) { 3341 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT], 3342 ctx.file_offset[TGSI_FILE_TEMPORARY] - 3343 ctx.file_offset[TGSI_FILE_OUTPUT], 3344 0x0F); 3345 } 3346 } 3347 3348 ctx.nliterals = 0; 3349 ctx.literals = NULL; 3350 ctx.max_driver_temp_used = 0; 3351 3352 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && 3353 ctx.info.colors_written == 1; 3354 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; 3355 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]; 3356 3357 if (ctx.type == PIPE_SHADER_VERTEX || 3358 ctx.type == PIPE_SHADER_GEOMETRY || 3359 ctx.type == PIPE_SHADER_TESS_EVAL) { 3360 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] + 3361 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1; 3362 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1; 3363 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]; 3364 } 3365 3366 if (shader->vs_as_gs_a) 3367 
vs_add_primid_output(&ctx, key.vs.prim_id_out); 3368 3369 if (ctx.type == PIPE_SHADER_TESS_EVAL) 3370 r600_fetch_tess_io_info(&ctx); 3371 3372 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3373 tgsi_parse_token(&ctx.parse); 3374 switch (ctx.parse.FullToken.Token.Type) { 3375 case TGSI_TOKEN_TYPE_IMMEDIATE: 3376 immediate = &ctx.parse.FullToken.FullImmediate; 3377 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16); 3378 if(ctx.literals == NULL) { 3379 r = -ENOMEM; 3380 goto out_err; 3381 } 3382 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint; 3383 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint; 3384 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint; 3385 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint; 3386 ctx.nliterals++; 3387 break; 3388 case TGSI_TOKEN_TYPE_DECLARATION: 3389 r = tgsi_declaration(&ctx); 3390 if (r) 3391 goto out_err; 3392 break; 3393 case TGSI_TOKEN_TYPE_INSTRUCTION: 3394 case TGSI_TOKEN_TYPE_PROPERTY: 3395 break; 3396 default: 3397 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type); 3398 r = -EINVAL; 3399 goto out_err; 3400 } 3401 } 3402 3403 shader->ring_item_sizes[0] = ctx.next_ring_offset; 3404 shader->ring_item_sizes[1] = 0; 3405 shader->ring_item_sizes[2] = 0; 3406 shader->ring_item_sizes[3] = 0; 3407 3408 /* Process two side if needed */ 3409 if (shader->two_side && ctx.colors_used) { 3410 int i, count = ctx.shader->ninput; 3411 unsigned next_lds_loc = ctx.shader->nlds; 3412 3413 /* additional inputs will be allocated right after the existing inputs, 3414 * we won't need them after the color selection, so we don't need to 3415 * reserve these gprs for the rest of the shader code and to adjust 3416 * output offsets etc. 
*/ 3417 int gpr = ctx.file_offset[TGSI_FILE_INPUT] + 3418 ctx.info.file_max[TGSI_FILE_INPUT] + 1; 3419 3420 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */ 3421 if (ctx.face_gpr == -1) { 3422 i = ctx.shader->ninput++; 3423 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE; 3424 ctx.shader->input[i].spi_sid = 0; 3425 ctx.shader->input[i].gpr = gpr++; 3426 ctx.face_gpr = ctx.shader->input[i].gpr; 3427 } 3428 3429 for (i = 0; i < count; i++) { 3430 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) { 3431 int ni = ctx.shader->ninput++; 3432 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io)); 3433 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR; 3434 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]); 3435 ctx.shader->input[ni].gpr = gpr++; 3436 // TGSI to LLVM needs to know the lds position of inputs. 3437 // Non LLVM path computes it later (in process_twoside_color) 3438 ctx.shader->input[ni].lds_pos = next_lds_loc++; 3439 ctx.shader->input[i].back_color_input = ni; 3440 if (ctx.bc->chip_class >= EVERGREEN) { 3441 if ((r = evergreen_interp_input(&ctx, ni))) 3442 return r; 3443 } 3444 } 3445 } 3446 } 3447 3448 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN) 3449 shader->nr_ps_max_color_exports = 8; 3450 3451 if (ctx.fragcoord_input >= 0) { 3452 if (ctx.bc->chip_class == CAYMAN) { 3453 for (j = 0 ; j < 4; j++) { 3454 struct r600_bytecode_alu alu; 3455 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3456 alu.op = ALU_OP1_RECIP_IEEE; 3457 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3458 alu.src[0].chan = 3; 3459 3460 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3461 alu.dst.chan = j; 3462 alu.dst.write = (j == 3); 3463 alu.last = (j == 3); 3464 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3465 return r; 3466 } 3467 } else { 3468 struct r600_bytecode_alu alu; 3469 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3470 alu.op = 
ALU_OP1_RECIP_IEEE; 3471 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; 3472 alu.src[0].chan = 3; 3473 3474 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; 3475 alu.dst.chan = 3; 3476 alu.dst.write = 1; 3477 alu.last = 1; 3478 if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) 3479 return r; 3480 } 3481 } 3482 3483 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3484 struct r600_bytecode_alu alu; 3485 int r; 3486 3487 /* GS thread with no output workaround - emit a cut at start of GS */ 3488 if (ctx.bc->chip_class == R600) 3489 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); 3490 3491 for (j = 0; j < 4; j++) { 3492 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3493 alu.op = ALU_OP1_MOV; 3494 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; 3495 alu.src[0].value = 0; 3496 alu.dst.sel = ctx.gs_export_gpr_tregs[j]; 3497 alu.dst.write = 1; 3498 alu.last = 1; 3499 r = r600_bytecode_add_alu(ctx.bc, &alu); 3500 if (r) 3501 return r; 3502 } 3503 3504 if (ctx.shader->gs_tri_strip_adj_fix) { 3505 r = single_alu_op2(&ctx, ALU_OP2_AND_INT, 3506 ctx.gs_rotated_input[0], 2, 3507 0, 2, 3508 V_SQ_ALU_SRC_LITERAL, 1); 3509 if (r) 3510 return r; 3511 3512 for (i = 0; i < 6; i++) { 3513 int rotated = (i + 4) % 6; 3514 int offset_reg = i / 3; 3515 int offset_chan = i % 3; 3516 int rotated_offset_reg = rotated / 3; 3517 int rotated_offset_chan = rotated % 3; 3518 3519 if (offset_reg == 0 && offset_chan == 2) 3520 offset_chan = 3; 3521 if (rotated_offset_reg == 0 && rotated_offset_chan == 2) 3522 rotated_offset_chan = 3; 3523 3524 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT, 3525 ctx.gs_rotated_input[offset_reg], offset_chan, 3526 ctx.gs_rotated_input[0], 2, 3527 offset_reg, offset_chan, 3528 rotated_offset_reg, rotated_offset_chan); 3529 if (r) 3530 return r; 3531 } 3532 } 3533 } 3534 3535 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3536 r600_fetch_tess_io_info(&ctx); 3537 3538 if (shader->two_side && ctx.colors_used) { 3539 if ((r = process_twoside_color_inputs(&ctx))) 3540 return r; 3541 
} 3542 3543 tgsi_parse_init(&ctx.parse, tokens); 3544 while (!tgsi_parse_end_of_tokens(&ctx.parse)) { 3545 tgsi_parse_token(&ctx.parse); 3546 switch (ctx.parse.FullToken.Token.Type) { 3547 case TGSI_TOKEN_TYPE_INSTRUCTION: 3548 r = tgsi_is_supported(&ctx); 3549 if (r) 3550 goto out_err; 3551 ctx.max_driver_temp_used = 0; 3552 /* reserve first tmp for everyone */ 3553 r600_get_temp(&ctx); 3554 3555 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; 3556 if ((r = tgsi_split_constant(&ctx))) 3557 goto out_err; 3558 if ((r = tgsi_split_literal_constant(&ctx))) 3559 goto out_err; 3560 if (ctx.type == PIPE_SHADER_GEOMETRY) { 3561 if ((r = tgsi_split_gs_inputs(&ctx))) 3562 goto out_err; 3563 } else if (lds_inputs) { 3564 if ((r = tgsi_split_lds_inputs(&ctx))) 3565 goto out_err; 3566 } 3567 if (ctx.bc->chip_class == CAYMAN) 3568 ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; 3569 else if (ctx.bc->chip_class >= EVERGREEN) 3570 ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; 3571 else 3572 ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; 3573 r = ctx.inst_info->process(&ctx); 3574 if (r) 3575 goto out_err; 3576 3577 if (ctx.type == PIPE_SHADER_TESS_CTRL) { 3578 r = r600_store_tcs_output(&ctx); 3579 if (r) 3580 goto out_err; 3581 } 3582 break; 3583 default: 3584 break; 3585 } 3586 } 3587 3588 /* Reset the temporary register counter. 
*/ 3589 ctx.max_driver_temp_used = 0; 3590 3591 noutput = shader->noutput; 3592 3593 if (!ring_outputs && ctx.clip_vertex_write) { 3594 unsigned clipdist_temp[2]; 3595 3596 clipdist_temp[0] = r600_get_temp(&ctx); 3597 clipdist_temp[1] = r600_get_temp(&ctx); 3598 3599 /* need to convert a clipvertex write into clipdistance writes and not export 3600 the clip vertex anymore */ 3601 3602 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io)); 3603 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 3604 shader->output[noutput].gpr = clipdist_temp[0]; 3605 noutput++; 3606 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST; 3607 shader->output[noutput].gpr = clipdist_temp[1]; 3608 noutput++; 3609 3610 /* reset spi_sid for clipvertex output to avoid confusing spi */ 3611 shader->output[ctx.cv_output].spi_sid = 0; 3612 3613 shader->clip_dist_write = 0xFF; 3614 shader->cc_dist_mask = 0xFF; 3615 3616 for (i = 0; i < 8; i++) { 3617 int oreg = i >> 2; 3618 int ochan = i & 3; 3619 3620 for (j = 0; j < 4; j++) { 3621 struct r600_bytecode_alu alu; 3622 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 3623 alu.op = ALU_OP2_DOT4; 3624 alu.src[0].sel = shader->output[ctx.cv_output].gpr; 3625 alu.src[0].chan = j; 3626 3627 alu.src[1].sel = 512 + i; 3628 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 3629 alu.src[1].chan = j; 3630 3631 alu.dst.sel = clipdist_temp[oreg]; 3632 alu.dst.chan = j; 3633 alu.dst.write = (j == ochan); 3634 if (j == 3) 3635 alu.last = 1; 3636 r = r600_bytecode_add_alu(ctx.bc, &alu); 3637 if (r) 3638 return r; 3639 } 3640 } 3641 } 3642 3643 /* Add stream outputs. 
*/ 3644 if (so.num_outputs) { 3645 bool emit = false; 3646 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX) 3647 emit = true; 3648 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL) 3649 emit = true; 3650 if (emit) 3651 emit_streamout(&ctx, &so, -1, NULL); 3652 } 3653 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; 3654 convert_edgeflag_to_int(&ctx); 3655 3656 if (ctx.type == PIPE_SHADER_TESS_CTRL) 3657 r600_emit_tess_factor(&ctx); 3658 3659 if (lds_outputs) { 3660 if (ctx.type == PIPE_SHADER_VERTEX) { 3661 if (ctx.shader->noutput) 3662 emit_lds_vs_writes(&ctx); 3663 } 3664 } else if (ring_outputs) { 3665 if (shader->vs_as_es || shader->tes_as_es) { 3666 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx); 3667 ctx.gs_export_gpr_tregs[1] = -1; 3668 ctx.gs_export_gpr_tregs[2] = -1; 3669 ctx.gs_export_gpr_tregs[3] = -1; 3670 3671 emit_gs_ring_writes(&ctx, &so, -1, FALSE); 3672 } 3673 } else { 3674 /* Export output */ 3675 next_clip_base = shader->vs_out_misc_write ? 
62 : 61; 3676 3677 for (i = 0, j = 0; i < noutput; i++, j++) { 3678 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 3679 output[j].gpr = shader->output[i].gpr; 3680 output[j].elem_size = 3; 3681 output[j].swizzle_x = 0; 3682 output[j].swizzle_y = 1; 3683 output[j].swizzle_z = 2; 3684 output[j].swizzle_w = 3; 3685 output[j].burst_count = 1; 3686 output[j].type = 0xffffffff; 3687 output[j].op = CF_OP_EXPORT; 3688 switch (ctx.type) { 3689 case PIPE_SHADER_VERTEX: 3690 case PIPE_SHADER_TESS_EVAL: 3691 switch (shader->output[i].name) { 3692 case TGSI_SEMANTIC_POSITION: 3693 output[j].array_base = 60; 3694 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 3695 pos_emitted = true; 3696 break; 3697 3698 case TGSI_SEMANTIC_PSIZE: 3699 output[j].array_base = 61; 3700 output[j].swizzle_y = 7; 3701 output[j].swizzle_z = 7; 3702 output[j].swizzle_w = 7; 3703 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 3704 pos_emitted = true; 3705 break; 3706 case TGSI_SEMANTIC_EDGEFLAG: 3707 output[j].array_base = 61; 3708 output[j].swizzle_x = 7; 3709 output[j].swizzle_y = 0; 3710 output[j].swizzle_z = 7; 3711 output[j].swizzle_w = 7; 3712 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 3713 pos_emitted = true; 3714 break; 3715 case TGSI_SEMANTIC_LAYER: 3716 /* spi_sid is 0 for outputs that are 3717 * not consumed by PS */ 3718 if (shader->output[i].spi_sid) { 3719 output[j].array_base = next_param_base++; 3720 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 3721 j++; 3722 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 3723 } 3724 output[j].array_base = 61; 3725 output[j].swizzle_x = 7; 3726 output[j].swizzle_y = 7; 3727 output[j].swizzle_z = 0; 3728 output[j].swizzle_w = 7; 3729 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 3730 pos_emitted = true; 3731 break; 3732 case TGSI_SEMANTIC_VIEWPORT_INDEX: 3733 /* spi_sid is 0 for outputs that are 3734 * not consumed by PS */ 3735 if 
(shader->output[i].spi_sid) { 3736 output[j].array_base = next_param_base++; 3737 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 3738 j++; 3739 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 3740 } 3741 output[j].array_base = 61; 3742 output[j].swizzle_x = 7; 3743 output[j].swizzle_y = 7; 3744 output[j].swizzle_z = 7; 3745 output[j].swizzle_w = 0; 3746 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 3747 pos_emitted = true; 3748 break; 3749 case TGSI_SEMANTIC_CLIPVERTEX: 3750 j--; 3751 break; 3752 case TGSI_SEMANTIC_CLIPDIST: 3753 output[j].array_base = next_clip_base++; 3754 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 3755 pos_emitted = true; 3756 /* spi_sid is 0 for clipdistance outputs that were generated 3757 * for clipvertex - we don't need to pass them to PS */ 3758 if (shader->output[i].spi_sid) { 3759 j++; 3760 /* duplicate it as PARAM to pass to the pixel shader */ 3761 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output)); 3762 output[j].array_base = next_param_base++; 3763 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 3764 } 3765 break; 3766 case TGSI_SEMANTIC_FOG: 3767 output[j].swizzle_y = 4; /* 0 */ 3768 output[j].swizzle_z = 4; /* 0 */ 3769 output[j].swizzle_w = 5; /* 1 */ 3770 break; 3771 case TGSI_SEMANTIC_PRIMID: 3772 output[j].swizzle_x = 2; 3773 output[j].swizzle_y = 4; /* 0 */ 3774 output[j].swizzle_z = 4; /* 0 */ 3775 output[j].swizzle_w = 4; /* 0 */ 3776 break; 3777 } 3778 3779 break; 3780 case PIPE_SHADER_FRAGMENT: 3781 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) { 3782 /* never export more colors than the number of CBs */ 3783 if (shader->output[i].sid >= max_color_exports) { 3784 /* skip export */ 3785 j--; 3786 continue; 3787 } 3788 output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; 3789 output[j].array_base = shader->output[i].sid; 3790 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 3791 shader->nr_ps_color_exports++; 3792 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) { 3793 for (k = 1; k < max_color_exports; k++) { 3794 j++; 3795 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 3796 output[j].gpr = shader->output[i].gpr; 3797 output[j].elem_size = 3; 3798 output[j].swizzle_x = 0; 3799 output[j].swizzle_y = 1; 3800 output[j].swizzle_z = 2; 3801 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3; 3802 output[j].burst_count = 1; 3803 output[j].array_base = k; 3804 output[j].op = CF_OP_EXPORT; 3805 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 3806 shader->nr_ps_color_exports++; 3807 } 3808 } 3809 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { 3810 output[j].array_base = 61; 3811 output[j].swizzle_x = 2; 3812 output[j].swizzle_y = 7; 3813 output[j].swizzle_z = output[j].swizzle_w = 7; 3814 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 3815 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) { 3816 output[j].array_base = 61; 3817 output[j].swizzle_x = 7; 3818 output[j].swizzle_y = 1; 3819 output[j].swizzle_z = output[j].swizzle_w = 7; 3820 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 3821 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) { 3822 output[j].array_base = 61; 3823 output[j].swizzle_x = 7; 3824 output[j].swizzle_y = 7; 3825 output[j].swizzle_z = 0; 3826 output[j].swizzle_w = 7; 3827 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 3828 } else { 3829 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name); 3830 r = -EINVAL; 3831 goto out_err; 3832 } 3833 break; 3834 case PIPE_SHADER_TESS_CTRL: 3835 break; 3836 default: 3837 R600_ERR("unsupported processor type %d\n", ctx.type); 3838 r = -EINVAL; 3839 goto out_err; 3840 } 3841 3842 if (output[j].type == 0xffffffff) { 3843 
output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 3844 output[j].array_base = next_param_base++; 3845 } 3846 } 3847 3848 /* add fake position export */ 3849 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) { 3850 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 3851 output[j].gpr = 0; 3852 output[j].elem_size = 3; 3853 output[j].swizzle_x = 7; 3854 output[j].swizzle_y = 7; 3855 output[j].swizzle_z = 7; 3856 output[j].swizzle_w = 7; 3857 output[j].burst_count = 1; 3858 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS; 3859 output[j].array_base = 60; 3860 output[j].op = CF_OP_EXPORT; 3861 j++; 3862 } 3863 3864 /* add fake param output for vertex shader if no param is exported */ 3865 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) { 3866 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 3867 output[j].gpr = 0; 3868 output[j].elem_size = 3; 3869 output[j].swizzle_x = 7; 3870 output[j].swizzle_y = 7; 3871 output[j].swizzle_z = 7; 3872 output[j].swizzle_w = 7; 3873 output[j].burst_count = 1; 3874 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; 3875 output[j].array_base = 0; 3876 output[j].op = CF_OP_EXPORT; 3877 j++; 3878 } 3879 3880 /* add fake pixel export */ 3881 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) { 3882 memset(&output[j], 0, sizeof(struct r600_bytecode_output)); 3883 output[j].gpr = 0; 3884 output[j].elem_size = 3; 3885 output[j].swizzle_x = 7; 3886 output[j].swizzle_y = 7; 3887 output[j].swizzle_z = 7; 3888 output[j].swizzle_w = 7; 3889 output[j].burst_count = 1; 3890 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; 3891 output[j].array_base = 0; 3892 output[j].op = CF_OP_EXPORT; 3893 j++; 3894 shader->nr_ps_color_exports++; 3895 } 3896 3897 noutput = j; 3898 3899 /* set export done on last export of each type */ 3900 for (k = noutput - 1, output_done = 0; k >= 
0; k--) {
			/* Scanning backwards, the first export seen of each type is
			 * the last one executed, so it gets the *_DONE opcode. */
			if (!(output_done & (1 << output[k].type))) {
				output_done |= (1 << output[k].type);
				output[k].op = CF_OP_EXPORT_DONE;
			}
		}
		/* add output to bytecode */
		for (i = 0; i < noutput; i++) {
			r = r600_bytecode_add_output(ctx.bc, &output[i]);
			if (r)
				goto out_err;
		}
	} /* end of export-emission block (opened before this chunk) */

	/* add program end */
	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		const struct cf_op_info *last = NULL;

		if (ctx.bc->cf_last)
			last = r600_isa_cf(ctx.bc->cf_last->op);

		/* alu clause instructions don't have EOP bit, so add NOP */
		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);

		ctx.bc->cf_last->end_of_program = 1;
	}

	/* check GPR limit - we have 124 = 128 - 4
	 * (4 are reserved as alu clause temporary registers) */
	if (ctx.bc->ngpr > 124) {
		R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
		r = -ENOMEM;
		goto out_err;
	}

	if (ctx.type == PIPE_SHADER_GEOMETRY) {
		if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
			return r;
	}

	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return 0;
out_err:
	free(ctx.literals);
	tgsi_parse_free(&ctx.parse);
	return r;
}

/* Report a TGSI opcode this backend cannot translate and fail the
 * compile with -EINVAL. */
static int tgsi_unsupported(struct r600_shader_ctx *ctx)
{
	const unsigned tgsi_opcode =
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
	R600_ERR("%s tgsi opcode unsupported\n",
		 tgsi_get_opcode_name(tgsi_opcode));
	return -EINVAL;
}

/* TGSI END emits no bytecode here; the end-of-program marker is added
 * after the whole token stream is processed (see the program-end code
 * in r600_shader_from_tgsi above). */
static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}

/* Copy channel `chan` of a translated TGSI source operand into an ALU
 * source slot: selector, swizzled channel, neg/abs/rel modifiers, the
 * literal value for that channel, and kcache bank/relative addressing. */
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			const struct r600_shader_src *shader_src,
			unsigned chan)
{
	bc_src->sel = shader_src->sel;
	bc_src->chan = shader_src->swizzle[chan];
	bc_src->neg = shader_src->neg;
	bc_src->abs = shader_src->abs;
	bc_src->rel = shader_src->rel;
	/* literal value is indexed by the post-swizzle channel */
	bc_src->value = shader_src->value[bc_src->chan];
	bc_src->kc_bank = shader_src->kc_bank;
	bc_src->kc_rel = shader_src->kc_rel;
}

/* Force an absolute-value modifier on an ALU source (clears neg, since
 * abs is applied after negation in this encoding). */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}

/* Flip the negate modifier on an ALU source. */
static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}

/* Translate a TGSI destination register into an ALU destination:
 * selector (register index plus per-file offset), channel, write enable
 * and saturate/clamp.  For TESS_CTRL outputs the dst is left as-is
 * (outputs are stored via a different path — NOTE(review): handled by
 * the LDS/TF store code elsewhere in this file, not visible here). */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		const struct tgsi_full_dst_register *tgsi_dst,
		unsigned swizzle,
		struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	r600_dst->sel = tgsi_dst->Register.Index;
	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
			return;
		}
	}
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;

}

/* Emit a 64-bit (double) two-operand ALU op.  A double occupies a
 * channel pair (xy or zw); `singledest` remaps the TGSI writemask to
 * the pair actually written and may stage the result in temp_reg
 * (use_tmp) when the source/dest pairs disagree.  `swap` exchanges
 * src0/src1; `dest_temp`/`op_override` let callers redirect the result
 * and opcode (used by egcm_double_to_int). */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;
	int swizzle_x = inst->Src[0].Register.SwizzleX;

	if (singledest) {
		switch (write_mask) {
		case 0x1:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else
				write_mask = 0x3;
			break;
		case 0x2:
			if (swizzle_x
== 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else {
				write_mask = 0x3;
				use_tmp = 1;
			}
			break;
		case 0x4:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else
				write_mask = 0xc;
			break;
		case 0x8:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else {
				write_mask = 0xc;
				use_tmp = 3;
			}
			break;
		}
	}

	/* re-derive the last written channel after the mask remap above */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			if (use_tmp || dest_temp) {
				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
			/* high half of the pair is produced implicitly */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op_override ? op_override : ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64_switch swaps the low/high words of each pair */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		lasti = tgsi_last_instruction(write_mask);
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;

			if (dest_temp) {
				alu.dst.sel = dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Two-operand double op with a full pair destination; rejects
 * writemasks that do not cover a whole xy or zw pair. */
static int tgsi_op2_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	/* confirm writemasking */
	if ((write_mask & 0x3) != 0x3 &&
	    (write_mask & 0xc) != 0xc) {
		fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
		return -1;
	}
	return tgsi_op2_64_params(ctx, false, false, 0, 0);
}

/* Double op producing a single (scalar) double result. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}

/* Same as above with src0/src1 swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}

/* Three-operand double op (op3 encoding).  Emits all four slots;
 * channels outside the writemask are dumped into a scratch temp. */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* NOTE(review): w slot reads channel 0, others channel 1 —
			 * presumably the low/high word layout of the double pair;
			 * confirm against the opcode's operand ordering. */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp;

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Generic two-operand 32-bit op.  `swap` exchanges the sources;
 * `trans_only` forces each slot to close an instruction group (t-slot
 * only ops) and stages into temp_reg when more than one component is
 * written. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	/* MUL_ZERO_WINS wants 0*anything == 0, which MUL (non-IEEE) gives */
	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0,
sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* trans-only ops must each terminate an instruction group */
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}

/* Plain two-operand op. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}

/* Two-operand op with src0/src1 exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}

/* Two-operand op restricted to the trans (t) slot. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}

/* Integer negate: emitted as op(0, src), i.e. 0 - src with the
 * subtract-style opcode from inst_info. */
static int tgsi_ineg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}

/* Double negate: copy all channels, toggling the sign only on the
 * high word (channels 1 and 3) of each double pair. */
static int tgsi_dneg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		/* sign bit lives in the high 32-bit word of the pair */
		if (i == 1 || i == 3)
			r600_bytecode_src_toggle_neg(&alu.src[0]);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}

/* DFRACEXP: split a double into significand (Dst[0]) and exponent
 * (Dst[1]).  First pass runs the opcode into all four temp channels,
 * then results are scattered to the two destinations below. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels.
 */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* significand sits in temp channels 2/3 */
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* only the first enabled dst1 channel is written */
			break;
		}
	}
	return 0;
}


/* I2D/U2D (Evergreen/Cayman): convert each 32-bit int to float in a
 * temp, then widen to a double pair with FLT32_TO_FLT64.  Odd (high)
 * channels feed a literal 0 as the second conversion input. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
	       inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* one int->float conversion per destination double pair */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* D2I/D2U (Evergreen/Cayman): narrow the double to a 32-bit float in
 * a fresh temp via tgsi_op2_64_params, then convert float->int per
 * enabled channel with the opcode from inst_info. */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int treg = r600_get_temp(ctx);
	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
	       inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	/* do a 64->32 into a temp register */
	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
	if (r)
		return r;

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i;
		alu.src[0].sel = treg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = (i == lasti);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* Emit a unary double op (e.g. RECIP_64, RSQ) into dst_reg.xy.
 * Sources are fed as (high, low) word pair; `abs` forces an
 * absolute value on the low-word operand.  On non-Cayman every slot
 * ends its group; on Cayman only the final slot does. */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* Unary double instruction (DRSQ/DSQRT/DRCP...): compute into
 * temp_reg.xy via cayman_emit_unary_double_raw (taking |src| for
 * DRSQ/DSQRT), then copy the pair out to the masked destination. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		/* result pair lives in t1.xy regardless of dest pair */
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Cayman scalar float op broadcast: issue the op in slots x..z (and w
 * if the writemask needs it), each slot reading src channel 0 and
 * writing only its masked channel. */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Cayman integer multiply (MULLO/MULHI): for each enabled dest
 * channel k, issue the op in all four slots reading source channel k,
 * keeping only slot k's result in t1; then move t1 to the dest. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}


/* Cayman DMUL: issue MUL_64 across all four slots (w reads the low
 * word, xyz the high word of the selected pair) into t1, then copy
 * to the masked destination pair. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Emit RECIP_64 + MUL_64 to implement division.
 */
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time.
This is the same constraint as
	 * in DMUL lowering. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	/* t1.xy = 1.0 / src1 */
	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
	if (r)
		return r;

	/* t1 = src0 * (1.0 / src1) */
	for (int i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL_64;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));

		alu.src[1].sel = t1;
		alu.src[1].chan = (i == 3) ? 0 : 1;

		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the quotient pair from t1.xy to the masked dest pair */
	for (int i = 0; i < 2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
		alu.dst.write = 1;
		if (i == 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * r600 - trunc to -PI..PI range
 * r700 - normalize by dividing by 2PI
 * see fdo bug 27901
 *
 * Leaves the range-reduced angle in temp_reg.x for the SIN/COS
 * emitters (tgsi_trig / cayman_trig) to consume.
 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* tmp.x = src * (1/2pi) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* tmp.x = fract(tmp.x) -> angle as a 0..1 turn fraction */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* r600: tmp.x = tmp.x * 2pi - pi (radians in -PI..PI);
	 * r700+: tmp.x = tmp.x - 0.5 (normalized turns) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* Cayman SIN/COS: range-reduce via tgsi_setup_trig, then broadcast
 * the t-slot-only op across the vector slots (see CAYMAN notes at the
 * top of this file). */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?
4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		/* every slot reads the reduced angle from temp_reg.x */
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* SIN/COS for r600..evergreen: range-reduce, run the scalar op once
 * into temp_reg.x, then replicate to every masked dest channel. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* KILL/KILL_IF: emit the kill compare op (from inst_info) on all four
 * channels.  Unconditional KILL compares 0 against -1; conditional
 * KILL_IF compares 0 against the source. */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}

/* TGSI LIT: dst = (1, max(src.x,0), lighting specular term, 1).
 * The z term is computed as exp(MUL_LIT(log(max(src.y,0)), src.w,
 * src.x)) and only emitted when dst.z is in the writemask. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log landed (differs per path above) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}

/* RSQ: reciprocal square root of |src.x| into temp_reg.x, then
 * replicate to the masked destination channels. */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	alu.op = ALU_OP1_RECIPSQRT_IEEE;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

/* Broadcast temp_reg.x into every channel of dst enabled by the
 * instruction's writemask. */
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.src[0].sel = ctx->temp_reg;
		alu.op = ALU_OP1_MOV;
		alu.dst.chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Trans-slot scalar op on src channel 0 (EXP/LOG/RECIP-style): run it
 * once into temp_reg.x and replicate to the masked dest channels. */
static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}

/* Cayman POW lowering (continues past this chunk). */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ?
4 : 3; 5153 5154 for (i = 0; i < 3; i++) { 5155 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5156 alu.op = ALU_OP1_LOG_IEEE; 5157 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5158 alu.dst.sel = ctx->temp_reg; 5159 alu.dst.chan = i; 5160 alu.dst.write = 1; 5161 if (i == 2) 5162 alu.last = 1; 5163 r = r600_bytecode_add_alu(ctx->bc, &alu); 5164 if (r) 5165 return r; 5166 } 5167 5168 /* b * LOG2(a) */ 5169 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5170 alu.op = ALU_OP2_MUL; 5171 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 5172 alu.src[1].sel = ctx->temp_reg; 5173 alu.dst.sel = ctx->temp_reg; 5174 alu.dst.write = 1; 5175 alu.last = 1; 5176 r = r600_bytecode_add_alu(ctx->bc, &alu); 5177 if (r) 5178 return r; 5179 5180 for (i = 0; i < last_slot; i++) { 5181 /* POW(a,b) = EXP2(b * LOG2(a))*/ 5182 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5183 alu.op = ALU_OP1_EXP_IEEE; 5184 alu.src[0].sel = ctx->temp_reg; 5185 5186 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5187 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; 5188 if (i == last_slot - 1) 5189 alu.last = 1; 5190 r = r600_bytecode_add_alu(ctx->bc, &alu); 5191 if (r) 5192 return r; 5193 } 5194 return 0; 5195 } 5196 5197 static int tgsi_pow(struct r600_shader_ctx *ctx) 5198 { 5199 struct r600_bytecode_alu alu; 5200 int r; 5201 5202 /* LOG2(a) */ 5203 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5204 alu.op = ALU_OP1_LOG_IEEE; 5205 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); 5206 alu.dst.sel = ctx->temp_reg; 5207 alu.dst.write = 1; 5208 alu.last = 1; 5209 r = r600_bytecode_add_alu(ctx->bc, &alu); 5210 if (r) 5211 return r; 5212 /* b * LOG2(a) */ 5213 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5214 alu.op = ALU_OP2_MUL; 5215 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 5216 alu.src[1].sel = ctx->temp_reg; 5217 alu.dst.sel = ctx->temp_reg; 5218 alu.dst.write = 1; 5219 alu.last = 1; 5220 r = r600_bytecode_add_alu(ctx->bc, &alu); 5221 if (r) 5222 return r; 5223 /* 
POW(a,b) = EXP2(b * LOG2(a))*/ 5224 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5225 alu.op = ALU_OP1_EXP_IEEE; 5226 alu.src[0].sel = ctx->temp_reg; 5227 alu.dst.sel = ctx->temp_reg; 5228 alu.dst.write = 1; 5229 alu.last = 1; 5230 r = r600_bytecode_add_alu(ctx->bc, &alu); 5231 if (r) 5232 return r; 5233 return tgsi_helper_tempx_replicate(ctx); 5234 } 5235 5236 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op) 5237 { 5238 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 5239 struct r600_bytecode_alu alu; 5240 int i, r, j; 5241 unsigned write_mask = inst->Dst[0].Register.WriteMask; 5242 int tmp0 = ctx->temp_reg; 5243 int tmp1 = r600_get_temp(ctx); 5244 int tmp2 = r600_get_temp(ctx); 5245 int tmp3 = r600_get_temp(ctx); 5246 /* Unsigned path: 5247 * 5248 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder 5249 * 5250 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error 5251 * 2. tmp0.z = lo (tmp0.x * src2) 5252 * 3. tmp0.w = -tmp0.z 5253 * 4. tmp0.y = hi (tmp0.x * src2) 5254 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2)) 5255 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error 5256 * 7. tmp1.x = tmp0.x - tmp0.w 5257 * 8. tmp1.y = tmp0.x + tmp0.w 5258 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) 5259 * 10. tmp0.z = hi(tmp0.x * src1) = q 5260 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r 5261 * 5262 * 12. tmp0.w = src1 - tmp0.y = r 5263 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison) 5264 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison) 5265 * 5266 * if DIV 5267 * 5268 * 15. tmp1.z = tmp0.z + 1 = q + 1 5269 * 16. tmp1.w = tmp0.z - 1 = q - 1 5270 * 5271 * else MOD 5272 * 5273 * 15. tmp1.z = tmp0.w - src2 = r - src2 5274 * 16. tmp1.w = tmp0.w + src2 = r + src2 5275 * 5276 * endif 5277 * 5278 * 17. tmp1.x = tmp1.x & tmp1.y 5279 * 5280 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z 5281 * MOD: 18. tmp0.z = tmp1.x==0 ? 
tmp0.w : tmp1.z 5282 * 5283 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z 5284 * 20. dst = src2==0 ? MAX_UINT : tmp0.z 5285 * 5286 * Signed path: 5287 * 5288 * Same as unsigned, using abs values of the operands, 5289 * and fixing the sign of the result in the end. 5290 */ 5291 5292 for (i = 0; i < 4; i++) { 5293 if (!(write_mask & (1<<i))) 5294 continue; 5295 5296 if (signed_op) { 5297 5298 /* tmp2.x = -src0 */ 5299 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5300 alu.op = ALU_OP2_SUB_INT; 5301 5302 alu.dst.sel = tmp2; 5303 alu.dst.chan = 0; 5304 alu.dst.write = 1; 5305 5306 alu.src[0].sel = V_SQ_ALU_SRC_0; 5307 5308 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5309 5310 alu.last = 1; 5311 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5312 return r; 5313 5314 /* tmp2.y = -src1 */ 5315 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5316 alu.op = ALU_OP2_SUB_INT; 5317 5318 alu.dst.sel = tmp2; 5319 alu.dst.chan = 1; 5320 alu.dst.write = 1; 5321 5322 alu.src[0].sel = V_SQ_ALU_SRC_0; 5323 5324 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5325 5326 alu.last = 1; 5327 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5328 return r; 5329 5330 /* tmp2.z sign bit is set if src0 and src2 signs are different */ 5331 /* it will be a sign of the quotient */ 5332 if (!mod) { 5333 5334 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5335 alu.op = ALU_OP2_XOR_INT; 5336 5337 alu.dst.sel = tmp2; 5338 alu.dst.chan = 2; 5339 alu.dst.write = 1; 5340 5341 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5342 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5343 5344 alu.last = 1; 5345 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5346 return r; 5347 } 5348 5349 /* tmp2.x = |src0| */ 5350 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5351 alu.op = ALU_OP3_CNDGE_INT; 5352 alu.is_op3 = 1; 5353 5354 alu.dst.sel = tmp2; 5355 alu.dst.chan = 0; 5356 alu.dst.write = 1; 5357 5358 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5359 r600_bytecode_src(&alu.src[1], 
&ctx->src[0], i); 5360 alu.src[2].sel = tmp2; 5361 alu.src[2].chan = 0; 5362 5363 alu.last = 1; 5364 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5365 return r; 5366 5367 /* tmp2.y = |src1| */ 5368 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5369 alu.op = ALU_OP3_CNDGE_INT; 5370 alu.is_op3 = 1; 5371 5372 alu.dst.sel = tmp2; 5373 alu.dst.chan = 1; 5374 alu.dst.write = 1; 5375 5376 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5377 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5378 alu.src[2].sel = tmp2; 5379 alu.src[2].chan = 1; 5380 5381 alu.last = 1; 5382 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5383 return r; 5384 5385 } 5386 5387 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */ 5388 if (ctx->bc->chip_class == CAYMAN) { 5389 /* tmp3.x = u2f(src2) */ 5390 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5391 alu.op = ALU_OP1_UINT_TO_FLT; 5392 5393 alu.dst.sel = tmp3; 5394 alu.dst.chan = 0; 5395 alu.dst.write = 1; 5396 5397 if (signed_op) { 5398 alu.src[0].sel = tmp2; 5399 alu.src[0].chan = 1; 5400 } else { 5401 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5402 } 5403 5404 alu.last = 1; 5405 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5406 return r; 5407 5408 /* tmp0.x = recip(tmp3.x) */ 5409 for (j = 0 ; j < 3; j++) { 5410 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5411 alu.op = ALU_OP1_RECIP_IEEE; 5412 5413 alu.dst.sel = tmp0; 5414 alu.dst.chan = j; 5415 alu.dst.write = (j == 0); 5416 5417 alu.src[0].sel = tmp3; 5418 alu.src[0].chan = 0; 5419 5420 if (j == 2) 5421 alu.last = 1; 5422 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5423 return r; 5424 } 5425 5426 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5427 alu.op = ALU_OP2_MUL; 5428 5429 alu.src[0].sel = tmp0; 5430 alu.src[0].chan = 0; 5431 5432 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 5433 alu.src[1].value = 0x4f800000; 5434 5435 alu.dst.sel = tmp3; 5436 alu.dst.write = 1; 5437 alu.last = 1; 5438 r = r600_bytecode_add_alu(ctx->bc, &alu); 
5439 if (r) 5440 return r; 5441 5442 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5443 alu.op = ALU_OP1_FLT_TO_UINT; 5444 5445 alu.dst.sel = tmp0; 5446 alu.dst.chan = 0; 5447 alu.dst.write = 1; 5448 5449 alu.src[0].sel = tmp3; 5450 alu.src[0].chan = 0; 5451 5452 alu.last = 1; 5453 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5454 return r; 5455 5456 } else { 5457 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5458 alu.op = ALU_OP1_RECIP_UINT; 5459 5460 alu.dst.sel = tmp0; 5461 alu.dst.chan = 0; 5462 alu.dst.write = 1; 5463 5464 if (signed_op) { 5465 alu.src[0].sel = tmp2; 5466 alu.src[0].chan = 1; 5467 } else { 5468 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5469 } 5470 5471 alu.last = 1; 5472 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5473 return r; 5474 } 5475 5476 /* 2. tmp0.z = lo (tmp0.x * src2) */ 5477 if (ctx->bc->chip_class == CAYMAN) { 5478 for (j = 0 ; j < 4; j++) { 5479 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5480 alu.op = ALU_OP2_MULLO_UINT; 5481 5482 alu.dst.sel = tmp0; 5483 alu.dst.chan = j; 5484 alu.dst.write = (j == 2); 5485 5486 alu.src[0].sel = tmp0; 5487 alu.src[0].chan = 0; 5488 if (signed_op) { 5489 alu.src[1].sel = tmp2; 5490 alu.src[1].chan = 1; 5491 } else { 5492 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5493 } 5494 5495 alu.last = (j == 3); 5496 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5497 return r; 5498 } 5499 } else { 5500 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5501 alu.op = ALU_OP2_MULLO_UINT; 5502 5503 alu.dst.sel = tmp0; 5504 alu.dst.chan = 2; 5505 alu.dst.write = 1; 5506 5507 alu.src[0].sel = tmp0; 5508 alu.src[0].chan = 0; 5509 if (signed_op) { 5510 alu.src[1].sel = tmp2; 5511 alu.src[1].chan = 1; 5512 } else { 5513 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5514 } 5515 5516 alu.last = 1; 5517 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5518 return r; 5519 } 5520 5521 /* 3. 
tmp0.w = -tmp0.z */ 5522 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5523 alu.op = ALU_OP2_SUB_INT; 5524 5525 alu.dst.sel = tmp0; 5526 alu.dst.chan = 3; 5527 alu.dst.write = 1; 5528 5529 alu.src[0].sel = V_SQ_ALU_SRC_0; 5530 alu.src[1].sel = tmp0; 5531 alu.src[1].chan = 2; 5532 5533 alu.last = 1; 5534 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5535 return r; 5536 5537 /* 4. tmp0.y = hi (tmp0.x * src2) */ 5538 if (ctx->bc->chip_class == CAYMAN) { 5539 for (j = 0 ; j < 4; j++) { 5540 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5541 alu.op = ALU_OP2_MULHI_UINT; 5542 5543 alu.dst.sel = tmp0; 5544 alu.dst.chan = j; 5545 alu.dst.write = (j == 1); 5546 5547 alu.src[0].sel = tmp0; 5548 alu.src[0].chan = 0; 5549 5550 if (signed_op) { 5551 alu.src[1].sel = tmp2; 5552 alu.src[1].chan = 1; 5553 } else { 5554 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5555 } 5556 alu.last = (j == 3); 5557 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5558 return r; 5559 } 5560 } else { 5561 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5562 alu.op = ALU_OP2_MULHI_UINT; 5563 5564 alu.dst.sel = tmp0; 5565 alu.dst.chan = 1; 5566 alu.dst.write = 1; 5567 5568 alu.src[0].sel = tmp0; 5569 alu.src[0].chan = 0; 5570 5571 if (signed_op) { 5572 alu.src[1].sel = tmp2; 5573 alu.src[1].chan = 1; 5574 } else { 5575 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5576 } 5577 5578 alu.last = 1; 5579 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5580 return r; 5581 } 5582 5583 /* 5. tmp0.z = (tmp0.y == 0 ? 
tmp0.w : tmp0.z) = abs(lo(rcp*src)) */ 5584 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5585 alu.op = ALU_OP3_CNDE_INT; 5586 alu.is_op3 = 1; 5587 5588 alu.dst.sel = tmp0; 5589 alu.dst.chan = 2; 5590 alu.dst.write = 1; 5591 5592 alu.src[0].sel = tmp0; 5593 alu.src[0].chan = 1; 5594 alu.src[1].sel = tmp0; 5595 alu.src[1].chan = 3; 5596 alu.src[2].sel = tmp0; 5597 alu.src[2].chan = 2; 5598 5599 alu.last = 1; 5600 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5601 return r; 5602 5603 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */ 5604 if (ctx->bc->chip_class == CAYMAN) { 5605 for (j = 0 ; j < 4; j++) { 5606 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5607 alu.op = ALU_OP2_MULHI_UINT; 5608 5609 alu.dst.sel = tmp0; 5610 alu.dst.chan = j; 5611 alu.dst.write = (j == 3); 5612 5613 alu.src[0].sel = tmp0; 5614 alu.src[0].chan = 2; 5615 5616 alu.src[1].sel = tmp0; 5617 alu.src[1].chan = 0; 5618 5619 alu.last = (j == 3); 5620 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5621 return r; 5622 } 5623 } else { 5624 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5625 alu.op = ALU_OP2_MULHI_UINT; 5626 5627 alu.dst.sel = tmp0; 5628 alu.dst.chan = 3; 5629 alu.dst.write = 1; 5630 5631 alu.src[0].sel = tmp0; 5632 alu.src[0].chan = 2; 5633 5634 alu.src[1].sel = tmp0; 5635 alu.src[1].chan = 0; 5636 5637 alu.last = 1; 5638 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5639 return r; 5640 } 5641 5642 /* 7. tmp1.x = tmp0.x - tmp0.w */ 5643 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5644 alu.op = ALU_OP2_SUB_INT; 5645 5646 alu.dst.sel = tmp1; 5647 alu.dst.chan = 0; 5648 alu.dst.write = 1; 5649 5650 alu.src[0].sel = tmp0; 5651 alu.src[0].chan = 0; 5652 alu.src[1].sel = tmp0; 5653 alu.src[1].chan = 3; 5654 5655 alu.last = 1; 5656 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5657 return r; 5658 5659 /* 8. 
tmp1.y = tmp0.x + tmp0.w */ 5660 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5661 alu.op = ALU_OP2_ADD_INT; 5662 5663 alu.dst.sel = tmp1; 5664 alu.dst.chan = 1; 5665 alu.dst.write = 1; 5666 5667 alu.src[0].sel = tmp0; 5668 alu.src[0].chan = 0; 5669 alu.src[1].sel = tmp0; 5670 alu.src[1].chan = 3; 5671 5672 alu.last = 1; 5673 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5674 return r; 5675 5676 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */ 5677 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5678 alu.op = ALU_OP3_CNDE_INT; 5679 alu.is_op3 = 1; 5680 5681 alu.dst.sel = tmp0; 5682 alu.dst.chan = 0; 5683 alu.dst.write = 1; 5684 5685 alu.src[0].sel = tmp0; 5686 alu.src[0].chan = 1; 5687 alu.src[1].sel = tmp1; 5688 alu.src[1].chan = 1; 5689 alu.src[2].sel = tmp1; 5690 alu.src[2].chan = 0; 5691 5692 alu.last = 1; 5693 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5694 return r; 5695 5696 /* 10. tmp0.z = hi(tmp0.x * src1) = q */ 5697 if (ctx->bc->chip_class == CAYMAN) { 5698 for (j = 0 ; j < 4; j++) { 5699 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5700 alu.op = ALU_OP2_MULHI_UINT; 5701 5702 alu.dst.sel = tmp0; 5703 alu.dst.chan = j; 5704 alu.dst.write = (j == 2); 5705 5706 alu.src[0].sel = tmp0; 5707 alu.src[0].chan = 0; 5708 5709 if (signed_op) { 5710 alu.src[1].sel = tmp2; 5711 alu.src[1].chan = 0; 5712 } else { 5713 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5714 } 5715 5716 alu.last = (j == 3); 5717 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5718 return r; 5719 } 5720 } else { 5721 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5722 alu.op = ALU_OP2_MULHI_UINT; 5723 5724 alu.dst.sel = tmp0; 5725 alu.dst.chan = 2; 5726 alu.dst.write = 1; 5727 5728 alu.src[0].sel = tmp0; 5729 alu.src[0].chan = 0; 5730 5731 if (signed_op) { 5732 alu.src[1].sel = tmp2; 5733 alu.src[1].chan = 0; 5734 } else { 5735 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 5736 } 5737 5738 alu.last = 1; 5739 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5740 
return r; 5741 } 5742 5743 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */ 5744 if (ctx->bc->chip_class == CAYMAN) { 5745 for (j = 0 ; j < 4; j++) { 5746 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5747 alu.op = ALU_OP2_MULLO_UINT; 5748 5749 alu.dst.sel = tmp0; 5750 alu.dst.chan = j; 5751 alu.dst.write = (j == 1); 5752 5753 if (signed_op) { 5754 alu.src[0].sel = tmp2; 5755 alu.src[0].chan = 1; 5756 } else { 5757 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5758 } 5759 5760 alu.src[1].sel = tmp0; 5761 alu.src[1].chan = 2; 5762 5763 alu.last = (j == 3); 5764 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5765 return r; 5766 } 5767 } else { 5768 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5769 alu.op = ALU_OP2_MULLO_UINT; 5770 5771 alu.dst.sel = tmp0; 5772 alu.dst.chan = 1; 5773 alu.dst.write = 1; 5774 5775 if (signed_op) { 5776 alu.src[0].sel = tmp2; 5777 alu.src[0].chan = 1; 5778 } else { 5779 r600_bytecode_src(&alu.src[0], &ctx->src[1], i); 5780 } 5781 5782 alu.src[1].sel = tmp0; 5783 alu.src[1].chan = 2; 5784 5785 alu.last = 1; 5786 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5787 return r; 5788 } 5789 5790 /* 12. tmp0.w = src1 - tmp0.y = r */ 5791 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5792 alu.op = ALU_OP2_SUB_INT; 5793 5794 alu.dst.sel = tmp0; 5795 alu.dst.chan = 3; 5796 alu.dst.write = 1; 5797 5798 if (signed_op) { 5799 alu.src[0].sel = tmp2; 5800 alu.src[0].chan = 0; 5801 } else { 5802 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5803 } 5804 5805 alu.src[1].sel = tmp0; 5806 alu.src[1].chan = 1; 5807 5808 alu.last = 1; 5809 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5810 return r; 5811 5812 /* 13. 
tmp1.x = tmp0.w >= src2 = r >= src2 */ 5813 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5814 alu.op = ALU_OP2_SETGE_UINT; 5815 5816 alu.dst.sel = tmp1; 5817 alu.dst.chan = 0; 5818 alu.dst.write = 1; 5819 5820 alu.src[0].sel = tmp0; 5821 alu.src[0].chan = 3; 5822 if (signed_op) { 5823 alu.src[1].sel = tmp2; 5824 alu.src[1].chan = 1; 5825 } else { 5826 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5827 } 5828 5829 alu.last = 1; 5830 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5831 return r; 5832 5833 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */ 5834 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5835 alu.op = ALU_OP2_SETGE_UINT; 5836 5837 alu.dst.sel = tmp1; 5838 alu.dst.chan = 1; 5839 alu.dst.write = 1; 5840 5841 if (signed_op) { 5842 alu.src[0].sel = tmp2; 5843 alu.src[0].chan = 0; 5844 } else { 5845 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 5846 } 5847 5848 alu.src[1].sel = tmp0; 5849 alu.src[1].chan = 1; 5850 5851 alu.last = 1; 5852 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5853 return r; 5854 5855 if (mod) { /* UMOD */ 5856 5857 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */ 5858 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5859 alu.op = ALU_OP2_SUB_INT; 5860 5861 alu.dst.sel = tmp1; 5862 alu.dst.chan = 2; 5863 alu.dst.write = 1; 5864 5865 alu.src[0].sel = tmp0; 5866 alu.src[0].chan = 3; 5867 5868 if (signed_op) { 5869 alu.src[1].sel = tmp2; 5870 alu.src[1].chan = 1; 5871 } else { 5872 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5873 } 5874 5875 alu.last = 1; 5876 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5877 return r; 5878 5879 /* 16. 
tmp1.w = tmp0.w + src2 = r + src2 */ 5880 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5881 alu.op = ALU_OP2_ADD_INT; 5882 5883 alu.dst.sel = tmp1; 5884 alu.dst.chan = 3; 5885 alu.dst.write = 1; 5886 5887 alu.src[0].sel = tmp0; 5888 alu.src[0].chan = 3; 5889 if (signed_op) { 5890 alu.src[1].sel = tmp2; 5891 alu.src[1].chan = 1; 5892 } else { 5893 r600_bytecode_src(&alu.src[1], &ctx->src[1], i); 5894 } 5895 5896 alu.last = 1; 5897 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5898 return r; 5899 5900 } else { /* UDIV */ 5901 5902 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */ 5903 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5904 alu.op = ALU_OP2_ADD_INT; 5905 5906 alu.dst.sel = tmp1; 5907 alu.dst.chan = 2; 5908 alu.dst.write = 1; 5909 5910 alu.src[0].sel = tmp0; 5911 alu.src[0].chan = 2; 5912 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 5913 5914 alu.last = 1; 5915 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5916 return r; 5917 5918 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */ 5919 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5920 alu.op = ALU_OP2_ADD_INT; 5921 5922 alu.dst.sel = tmp1; 5923 alu.dst.chan = 3; 5924 alu.dst.write = 1; 5925 5926 alu.src[0].sel = tmp0; 5927 alu.src[0].chan = 2; 5928 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT; 5929 5930 alu.last = 1; 5931 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5932 return r; 5933 5934 } 5935 5936 /* 17. tmp1.x = tmp1.x & tmp1.y */ 5937 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5938 alu.op = ALU_OP2_AND_INT; 5939 5940 alu.dst.sel = tmp1; 5941 alu.dst.chan = 0; 5942 alu.dst.write = 1; 5943 5944 alu.src[0].sel = tmp1; 5945 alu.src[0].chan = 0; 5946 alu.src[1].sel = tmp1; 5947 alu.src[1].chan = 1; 5948 5949 alu.last = 1; 5950 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5951 return r; 5952 5953 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */ 5954 /* 18. tmp0.z = tmp1.x==0 ? 
tmp0.w : tmp1.z MOD */ 5955 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5956 alu.op = ALU_OP3_CNDE_INT; 5957 alu.is_op3 = 1; 5958 5959 alu.dst.sel = tmp0; 5960 alu.dst.chan = 2; 5961 alu.dst.write = 1; 5962 5963 alu.src[0].sel = tmp1; 5964 alu.src[0].chan = 0; 5965 alu.src[1].sel = tmp0; 5966 alu.src[1].chan = mod ? 3 : 2; 5967 alu.src[2].sel = tmp1; 5968 alu.src[2].chan = 2; 5969 5970 alu.last = 1; 5971 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5972 return r; 5973 5974 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */ 5975 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 5976 alu.op = ALU_OP3_CNDE_INT; 5977 alu.is_op3 = 1; 5978 5979 if (signed_op) { 5980 alu.dst.sel = tmp0; 5981 alu.dst.chan = 2; 5982 alu.dst.write = 1; 5983 } else { 5984 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 5985 } 5986 5987 alu.src[0].sel = tmp1; 5988 alu.src[0].chan = 1; 5989 alu.src[1].sel = tmp1; 5990 alu.src[1].chan = 3; 5991 alu.src[2].sel = tmp0; 5992 alu.src[2].chan = 2; 5993 5994 alu.last = 1; 5995 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 5996 return r; 5997 5998 if (signed_op) { 5999 6000 /* fix the sign of the result */ 6001 6002 if (mod) { 6003 6004 /* tmp0.x = -tmp0.z */ 6005 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6006 alu.op = ALU_OP2_SUB_INT; 6007 6008 alu.dst.sel = tmp0; 6009 alu.dst.chan = 0; 6010 alu.dst.write = 1; 6011 6012 alu.src[0].sel = V_SQ_ALU_SRC_0; 6013 alu.src[1].sel = tmp0; 6014 alu.src[1].chan = 2; 6015 6016 alu.last = 1; 6017 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6018 return r; 6019 6020 /* sign of the remainder is the same as the sign of src0 */ 6021 /* tmp0.x = src0>=0 ? 
tmp0.z : tmp0.x */ 6022 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6023 alu.op = ALU_OP3_CNDGE_INT; 6024 alu.is_op3 = 1; 6025 6026 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6027 6028 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6029 alu.src[1].sel = tmp0; 6030 alu.src[1].chan = 2; 6031 alu.src[2].sel = tmp0; 6032 alu.src[2].chan = 0; 6033 6034 alu.last = 1; 6035 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6036 return r; 6037 6038 } else { 6039 6040 /* tmp0.x = -tmp0.z */ 6041 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6042 alu.op = ALU_OP2_SUB_INT; 6043 6044 alu.dst.sel = tmp0; 6045 alu.dst.chan = 0; 6046 alu.dst.write = 1; 6047 6048 alu.src[0].sel = V_SQ_ALU_SRC_0; 6049 alu.src[1].sel = tmp0; 6050 alu.src[1].chan = 2; 6051 6052 alu.last = 1; 6053 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6054 return r; 6055 6056 /* fix the quotient sign (same as the sign of src0*src1) */ 6057 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */ 6058 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6059 alu.op = ALU_OP3_CNDGE_INT; 6060 alu.is_op3 = 1; 6061 6062 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6063 6064 alu.src[0].sel = tmp2; 6065 alu.src[0].chan = 2; 6066 alu.src[1].sel = tmp0; 6067 alu.src[1].chan = 2; 6068 alu.src[2].sel = tmp0; 6069 alu.src[2].chan = 0; 6070 6071 alu.last = 1; 6072 if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) 6073 return r; 6074 } 6075 } 6076 } 6077 return 0; 6078 } 6079 6080 static int tgsi_udiv(struct r600_shader_ctx *ctx) 6081 { 6082 return tgsi_divmod(ctx, 0, 0); 6083 } 6084 6085 static int tgsi_umod(struct r600_shader_ctx *ctx) 6086 { 6087 return tgsi_divmod(ctx, 1, 0); 6088 } 6089 6090 static int tgsi_idiv(struct r600_shader_ctx *ctx) 6091 { 6092 return tgsi_divmod(ctx, 0, 1); 6093 } 6094 6095 static int tgsi_imod(struct r600_shader_ctx *ctx) 6096 { 6097 return tgsi_divmod(ctx, 1, 1); 6098 } 6099 6100 6101 static int tgsi_f2i(struct r600_shader_ctx *ctx) 6102 { 6103 struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction; 6104 struct r600_bytecode_alu alu; 6105 int i, r; 6106 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6107 int last_inst = tgsi_last_instruction(write_mask); 6108 6109 for (i = 0; i < 4; i++) { 6110 if (!(write_mask & (1<<i))) 6111 continue; 6112 6113 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6114 alu.op = ALU_OP1_TRUNC; 6115 6116 alu.dst.sel = ctx->temp_reg; 6117 alu.dst.chan = i; 6118 alu.dst.write = 1; 6119 6120 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6121 if (i == last_inst) 6122 alu.last = 1; 6123 r = r600_bytecode_add_alu(ctx->bc, &alu); 6124 if (r) 6125 return r; 6126 } 6127 6128 for (i = 0; i < 4; i++) { 6129 if (!(write_mask & (1<<i))) 6130 continue; 6131 6132 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6133 alu.op = ctx->inst_info->op; 6134 6135 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6136 6137 alu.src[0].sel = ctx->temp_reg; 6138 alu.src[0].chan = i; 6139 6140 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT) 6141 alu.last = 1; 6142 r = r600_bytecode_add_alu(ctx->bc, &alu); 6143 if (r) 6144 return r; 6145 } 6146 6147 return 0; 6148 } 6149 6150 static int tgsi_iabs(struct r600_shader_ctx *ctx) 6151 { 6152 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6153 struct r600_bytecode_alu alu; 6154 int i, r; 6155 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6156 int last_inst = tgsi_last_instruction(write_mask); 6157 6158 /* tmp = -src */ 6159 for (i = 0; i < 4; i++) { 6160 if (!(write_mask & (1<<i))) 6161 continue; 6162 6163 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6164 alu.op = ALU_OP2_SUB_INT; 6165 6166 alu.dst.sel = ctx->temp_reg; 6167 alu.dst.chan = i; 6168 alu.dst.write = 1; 6169 6170 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 6171 alu.src[0].sel = V_SQ_ALU_SRC_0; 6172 6173 if (i == last_inst) 6174 alu.last = 1; 6175 r = r600_bytecode_add_alu(ctx->bc, &alu); 6176 if (r) 6177 return r; 6178 } 6179 6180 /* dst = (src >= 0 ? 
src : tmp) */ 6181 for (i = 0; i < 4; i++) { 6182 if (!(write_mask & (1<<i))) 6183 continue; 6184 6185 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6186 alu.op = ALU_OP3_CNDGE_INT; 6187 alu.is_op3 = 1; 6188 alu.dst.write = 1; 6189 6190 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6191 6192 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6193 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 6194 alu.src[2].sel = ctx->temp_reg; 6195 alu.src[2].chan = i; 6196 6197 if (i == last_inst) 6198 alu.last = 1; 6199 r = r600_bytecode_add_alu(ctx->bc, &alu); 6200 if (r) 6201 return r; 6202 } 6203 return 0; 6204 } 6205 6206 static int tgsi_issg(struct r600_shader_ctx *ctx) 6207 { 6208 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6209 struct r600_bytecode_alu alu; 6210 int i, r; 6211 unsigned write_mask = inst->Dst[0].Register.WriteMask; 6212 int last_inst = tgsi_last_instruction(write_mask); 6213 6214 /* tmp = (src >= 0 ? src : -1) */ 6215 for (i = 0; i < 4; i++) { 6216 if (!(write_mask & (1<<i))) 6217 continue; 6218 6219 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6220 alu.op = ALU_OP3_CNDGE_INT; 6221 alu.is_op3 = 1; 6222 6223 alu.dst.sel = ctx->temp_reg; 6224 alu.dst.chan = i; 6225 alu.dst.write = 1; 6226 6227 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6228 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 6229 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT; 6230 6231 if (i == last_inst) 6232 alu.last = 1; 6233 r = r600_bytecode_add_alu(ctx->bc, &alu); 6234 if (r) 6235 return r; 6236 } 6237 6238 /* dst = (tmp > 0 ? 
1 : tmp) */ 6239 for (i = 0; i < 4; i++) { 6240 if (!(write_mask & (1<<i))) 6241 continue; 6242 6243 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6244 alu.op = ALU_OP3_CNDGT_INT; 6245 alu.is_op3 = 1; 6246 alu.dst.write = 1; 6247 6248 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); 6249 6250 alu.src[0].sel = ctx->temp_reg; 6251 alu.src[0].chan = i; 6252 6253 alu.src[1].sel = V_SQ_ALU_SRC_1_INT; 6254 6255 alu.src[2].sel = ctx->temp_reg; 6256 alu.src[2].chan = i; 6257 6258 if (i == last_inst) 6259 alu.last = 1; 6260 r = r600_bytecode_add_alu(ctx->bc, &alu); 6261 if (r) 6262 return r; 6263 } 6264 return 0; 6265 } 6266 6267 6268 6269 static int tgsi_ssg(struct r600_shader_ctx *ctx) 6270 { 6271 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 6272 struct r600_bytecode_alu alu; 6273 int i, r; 6274 6275 /* tmp = (src > 0 ? 1 : src) */ 6276 for (i = 0; i < 4; i++) { 6277 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 6278 alu.op = ALU_OP3_CNDGT; 6279 alu.is_op3 = 1; 6280 6281 alu.dst.sel = ctx->temp_reg; 6282 alu.dst.chan = i; 6283 6284 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 6285 alu.src[1].sel = V_SQ_ALU_SRC_1; 6286 r600_bytecode_src(&alu.src[2], &ctx->src[0], i); 6287 6288 if (i == 3) 6289 alu.last = 1; 6290 r = r600_bytecode_add_alu(ctx->bc, &alu); 6291 if (r) 6292 return r; 6293 } 6294 6295 /* dst = (-tmp > 0 ? 
-1 : tmp) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* negate tmp so "tmp < 0" becomes "src0 > 0" for CNDGT */
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		alu.src[0].neg = 1;

		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[1].neg = 1;

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* TGSI_OPCODE_BFI: insert src[1] into src[0] at bit offset src[2],
 * width src[3] bits.  Lowered as SETGE (width >= 32 predicate), BFM
 * (build mask), LSHL (align insert value), BFI (merge), and a final
 * CNDE pass that handles the width >= 32 case. */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	/* temp_reg = (width >= 32) — predicate consumed by the last pass */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp: t1 = BFM(width, offset) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4;
i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left: t2 = insert << offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert: dst = (t1 & t2) | (~t1 & base) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* width >= 32 fixup: CNDE_INT picks the BFI result when the
	 * predicate is 0, otherwise the raw insert value replaces dst */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* read the BFI result back from the destination just written */
		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* TGSI_OPCODE_IMSB/UMSB: index of the most significant set (or sign) bit,
 * built from the hardware find-first-bit-high ops plus a 31-x flip. */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
	       ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 : convert msb-relative index to lsb-relative */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ?
t2 : t1 */
		/* the "no bit found" sentinel from FFBH is negative and is
		 * passed through unchanged by CNDGE_INT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* TGSI_OPCODE_INTERP_CENTROID/OFFSET/SAMPLE on Evergreen/Cayman:
 * pick an allocated ij interpolator pair, optionally adjust it with
 * gradients for the offset/sample variants, then emit INTERP_XY/ZW. */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* each GPR holds two ij pairs: pair index -> (gpr, base channel) */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
	    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1],
ctx->src[1].swizzle[0]);
		}

		/* fetch screen-space gradients of the ij pair */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;	/* 7 = masked */
			tex.dst_sel_w = 7;
			tex.inst_mod = 1;	// Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp = ddx(ij) * offset.x + ij   (offset.x from the sample
		 * position for INTERP_SAMPLE, from src[1] for INTERP_OFFSET) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp = ddy(ij) * offset.y + temp */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i
== 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* emit the INTERP pairs; ZW first then XY, each pair consumes the
	 * ij coordinates with swapped channel order (j in .x, i in .y) */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		/* only chans 2,3 of the ZW op and 0,1 of the XY op are kept */
		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}


/* Copy ctx->temp_reg to the instruction's destination, honoring the
 * write mask (NOP is emitted for masked channels to keep the group shape). */
static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
{
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
			alu.op = ALU_OP0_NOP;
			alu.dst.chan = i;
		} else {
alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Prepare one source operand of an op3 ALU instruction.  If the operand
 * carries an abs modifier (unsupported by op3 encoding) it is first copied
 * through a MOV into the caller-provided temp register. */
static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
                                 unsigned temp, int chan,
                                 struct r600_bytecode_alu_src *bc_src,
                                 const struct r600_shader_src *shader_src)
{
	struct r600_bytecode_alu alu;
	int r;

	r600_bytecode_src(bc_src, shader_src, chan);

	/* op3 operands don't support abs modifier */
	if (bc_src->abs) {
		assert(temp!=0);      /* we actually need the extra register, make sure it is allocated. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp;
		alu.dst.chan = chan;
		alu.dst.write = 1;

		/* MOV keeps the abs flag, so the copy holds |src| */
		alu.src[0] = *bc_src;
		alu.last = true; // sufficient?
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* redirect the operand at the freshly written temp */
		memset(bc_src, 0, sizeof(*bc_src));
		bc_src->sel = temp;
		bc_src->chan = chan;
	}
	return 0;
}

/* Generic three-operand ALU emitter; dst == -1 means "use the TGSI
 * destination", otherwise dst is an explicit GPR index. */
static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int temp_regs[4];
	unsigned op = ctx->inst_info->op;

	/* MUL_ZERO_WINS wants the non-IEEE multiply behavior */
	if (op == ALU_OP3_MULADD_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP3_MULADD;

	/* reserve scratch registers only for operands that need abs lowering */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		temp_regs[j] = 0;
		if (ctx->src[j].abs)
			temp_regs[j] = r600_get_temp(ctx);
	}
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
alu.op = op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
			if (r)
				return r;
		}

		if (dst == -1) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		} else {
			alu.dst.sel = dst;
		}
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Standard op3 handler: write to the TGSI destination. */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	return tgsi_op3_dst(ctx, -1);
}

/* Dot-product opcodes (DP2/DP3/DP4 and friends): DOT4 with unused source
 * channels forced to zero so the reduction matches the shorter variants. */
static int tgsi_dp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	unsigned op = ctx->inst_info->op;
	if (op == ALU_OP2_DOT4_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_DOT4;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		/* handle some special cases */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_DP2:
			if (i > 1) {
				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
				alu.src[0].chan = alu.src[1].chan = 0;
			}
			break;
		case TGSI_OPCODE_DP3:
			if (i > 2) {
				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
				alu.src[0].chan = alu.src[1].chan = 0;
			}
			break;
		default:
			break;
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

static inline boolean
tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
			      unsigned index)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* Texture fetches read coordinates straight from a GPR, so anything
	 * that is not already a plain register — or that carries a neg/abs
	 * modifier, or is a GS input (ring-buffer sourced) — must first be
	 * copied into a temp. */
	return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
		inst->Src[index].Register.File != TGSI_FILE_INPUT &&
		inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
		ctx->src[index].neg || ctx->src[index].abs ||
		(inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
}

/* Map a TGSI source register to its absolute GPR index. */
static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
					    unsigned index)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
}

/* TXF on a buffer texture: emitted as a vertex-fetch clause rather than a
 * texture instruction.  On pre-Evergreen a post-fetch mask/or fixup is
 * appended (see the tail of this function). */
static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
{
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_alu alu;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int src_gpr, r, i;
	int id = tgsi_tex_get_src_gpr(ctx, 1);
	int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ?
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
	if (src_requires_loading) {
		/* copy the coordinate into temp_reg so the fetch reads a plain GPR */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_gpr = ctx->temp_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = src_gpr;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* dst_sel 7 masks out channels not in the write mask */
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;	/* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;	/* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;	/* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ?
3 : 7;	/* SEL_W */
	vtx.use_const_fields = 1;
	vtx.buffer_index_mode = sampler_index_mode;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	/* Evergreen+ handles the swizzle/format in the fetch itself */
	if (ctx->bc->chip_class >= EVERGREEN)
		return 0;

	/* pre-Evergreen: AND each fetched channel with the per-buffer mask
	 * word from the buffer-info constant buffer */
	for (i = 0; i < 4; i++) {
		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.chan = i;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = i;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
		alu.src[1].sel += (id * 2);
		alu.src[1].chan = i % 4;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* OR in the constant alpha word when xy channels were written */
	if (inst->Dst[0].Register.WriteMask & 3) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_OR_INT;

		alu.dst.chan = 3;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = 3;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
		alu.src[1].chan = 0;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* TXQ on a buffer texture: return the buffer size.  Pre-Evergreen reads it
 * from the buffer-info constant buffer; Evergreen+ uses a resinfo fetch. */
static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ?
2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->bc->chip_class < EVERGREEN) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* r600 we have them at channel 2 of the second dword */
		alu.src[0].sel += (id * 2) + 1;
		alu.src[0].chan = 1;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		return 0;
	} else {
		struct r600_bytecode_vtx vtx;
		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */
		vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.src_gpr = 0;
		vtx.mega_fetch_count = 16; /* no idea here really... */
		vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
		/* sel 0 = size in x; sel 4 = constant 0 for the other channels */
		vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
		vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7;		/* SEL_Y */
		vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7;		/* SEL_Z */
		vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ?
4 : 7; /* SEL_W */ 6996 vtx.data_format = FMT_32_32_32_32; 6997 vtx.buffer_index_mode = sampler_index_mode; 6998 6999 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx))) 7000 return r; 7001 return 0; 7002 } 7003 } 7004 7005 7006 static int tgsi_tex(struct r600_shader_ctx *ctx) 7007 { 7008 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 7009 struct r600_bytecode_tex tex; 7010 struct r600_bytecode_alu alu; 7011 unsigned src_gpr; 7012 int r, i, j; 7013 int opcode; 7014 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing && 7015 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 7016 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA || 7017 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA); 7018 7019 bool txf_add_offsets = inst->Texture.NumOffsets && 7020 inst->Instruction.Opcode == TGSI_OPCODE_TXF && 7021 inst->Texture.Texture != TGSI_TEXTURE_BUFFER; 7022 7023 /* Texture fetch instructions can only use gprs as source. 7024 * Also they cannot negate the source or take the absolute value */ 7025 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS && 7026 tgsi_tex_src_requires_loading(ctx, 0)) || 7027 read_compressed_msaa || txf_add_offsets; 7028 7029 boolean src_loaded = FALSE; 7030 unsigned sampler_src_reg = 1; 7031 int8_t offset_x = 0, offset_y = 0, offset_z = 0; 7032 boolean has_txq_cube_array_z = false; 7033 unsigned sampler_index_mode; 7034 7035 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ && 7036 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7037 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY))) 7038 if (inst->Dst[0].Register.WriteMask & 4) { 7039 ctx->shader->has_txq_cube_array_z_comp = true; 7040 has_txq_cube_array_z = true; 7041 } 7042 7043 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || 7044 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7045 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 || 7046 inst->Instruction.Opcode == TGSI_OPCODE_TG4) 7047 sampler_src_reg = 2; 7048 
7049 /* TGSI moves the sampler to src reg 3 for TXD */ 7050 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) 7051 sampler_src_reg = 3; 7052 7053 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 7054 7055 src_gpr = tgsi_tex_get_src_gpr(ctx, 0); 7056 7057 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) { 7058 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) { 7059 if (ctx->bc->chip_class < EVERGREEN) 7060 ctx->shader->uses_tex_buffers = true; 7061 return r600_do_buffer_txq(ctx, 1, 0); 7062 } 7063 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) { 7064 if (ctx->bc->chip_class < EVERGREEN) 7065 ctx->shader->uses_tex_buffers = true; 7066 return do_vtx_fetch_inst(ctx, src_requires_loading); 7067 } 7068 } 7069 7070 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) { 7071 int out_chan; 7072 /* Add perspective divide */ 7073 if (ctx->bc->chip_class == CAYMAN) { 7074 out_chan = 2; 7075 for (i = 0; i < 3; i++) { 7076 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7077 alu.op = ALU_OP1_RECIP_IEEE; 7078 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7079 7080 alu.dst.sel = ctx->temp_reg; 7081 alu.dst.chan = i; 7082 if (i == 2) 7083 alu.last = 1; 7084 if (out_chan == i) 7085 alu.dst.write = 1; 7086 r = r600_bytecode_add_alu(ctx->bc, &alu); 7087 if (r) 7088 return r; 7089 } 7090 7091 } else { 7092 out_chan = 3; 7093 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7094 alu.op = ALU_OP1_RECIP_IEEE; 7095 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7096 7097 alu.dst.sel = ctx->temp_reg; 7098 alu.dst.chan = out_chan; 7099 alu.last = 1; 7100 alu.dst.write = 1; 7101 r = r600_bytecode_add_alu(ctx->bc, &alu); 7102 if (r) 7103 return r; 7104 } 7105 7106 for (i = 0; i < 3; i++) { 7107 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7108 alu.op = ALU_OP2_MUL; 7109 alu.src[0].sel = ctx->temp_reg; 7110 alu.src[0].chan = out_chan; 7111 r600_bytecode_src(&alu.src[1], &ctx->src[0], i); 7112 alu.dst.sel = 
ctx->temp_reg; 7113 alu.dst.chan = i; 7114 alu.dst.write = 1; 7115 r = r600_bytecode_add_alu(ctx->bc, &alu); 7116 if (r) 7117 return r; 7118 } 7119 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7120 alu.op = ALU_OP1_MOV; 7121 alu.src[0].sel = V_SQ_ALU_SRC_1; 7122 alu.src[0].chan = 0; 7123 alu.dst.sel = ctx->temp_reg; 7124 alu.dst.chan = 3; 7125 alu.last = 1; 7126 alu.dst.write = 1; 7127 r = r600_bytecode_add_alu(ctx->bc, &alu); 7128 if (r) 7129 return r; 7130 src_loaded = TRUE; 7131 src_gpr = ctx->temp_reg; 7132 } 7133 7134 7135 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || 7136 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7137 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7138 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && 7139 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) { 7140 7141 static const unsigned src0_swizzle[] = {2, 2, 0, 1}; 7142 static const unsigned src1_swizzle[] = {1, 0, 2, 2}; 7143 7144 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */ 7145 for (i = 0; i < 4; i++) { 7146 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7147 alu.op = ALU_OP2_CUBE; 7148 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]); 7149 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]); 7150 alu.dst.sel = ctx->temp_reg; 7151 alu.dst.chan = i; 7152 if (i == 3) 7153 alu.last = 1; 7154 alu.dst.write = 1; 7155 r = r600_bytecode_add_alu(ctx->bc, &alu); 7156 if (r) 7157 return r; 7158 } 7159 7160 /* tmp1.z = RCP_e(|tmp1.z|) */ 7161 if (ctx->bc->chip_class == CAYMAN) { 7162 for (i = 0; i < 3; i++) { 7163 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7164 alu.op = ALU_OP1_RECIP_IEEE; 7165 alu.src[0].sel = ctx->temp_reg; 7166 alu.src[0].chan = 2; 7167 alu.src[0].abs = 1; 7168 alu.dst.sel = ctx->temp_reg; 7169 alu.dst.chan = i; 7170 if (i == 2) 7171 alu.dst.write = 1; 7172 if (i == 2) 7173 alu.last = 1; 7174 r = r600_bytecode_add_alu(ctx->bc, &alu); 7175 if (r) 7176 return r; 7177 } 7178 } else { 7179 memset(&alu, 0, sizeof(struct 
r600_bytecode_alu)); 7180 alu.op = ALU_OP1_RECIP_IEEE; 7181 alu.src[0].sel = ctx->temp_reg; 7182 alu.src[0].chan = 2; 7183 alu.src[0].abs = 1; 7184 alu.dst.sel = ctx->temp_reg; 7185 alu.dst.chan = 2; 7186 alu.dst.write = 1; 7187 alu.last = 1; 7188 r = r600_bytecode_add_alu(ctx->bc, &alu); 7189 if (r) 7190 return r; 7191 } 7192 7193 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x 7194 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x 7195 * muladd has no writemask, have to use another temp 7196 */ 7197 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7198 alu.op = ALU_OP3_MULADD; 7199 alu.is_op3 = 1; 7200 7201 alu.src[0].sel = ctx->temp_reg; 7202 alu.src[0].chan = 0; 7203 alu.src[1].sel = ctx->temp_reg; 7204 alu.src[1].chan = 2; 7205 7206 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 7207 alu.src[2].chan = 0; 7208 alu.src[2].value = u_bitcast_f2u(1.5f); 7209 7210 alu.dst.sel = ctx->temp_reg; 7211 alu.dst.chan = 0; 7212 alu.dst.write = 1; 7213 7214 r = r600_bytecode_add_alu(ctx->bc, &alu); 7215 if (r) 7216 return r; 7217 7218 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7219 alu.op = ALU_OP3_MULADD; 7220 alu.is_op3 = 1; 7221 7222 alu.src[0].sel = ctx->temp_reg; 7223 alu.src[0].chan = 1; 7224 alu.src[1].sel = ctx->temp_reg; 7225 alu.src[1].chan = 2; 7226 7227 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; 7228 alu.src[2].chan = 0; 7229 alu.src[2].value = u_bitcast_f2u(1.5f); 7230 7231 alu.dst.sel = ctx->temp_reg; 7232 alu.dst.chan = 1; 7233 alu.dst.write = 1; 7234 7235 alu.last = 1; 7236 r = r600_bytecode_add_alu(ctx->bc, &alu); 7237 if (r) 7238 return r; 7239 /* write initial compare value into Z component 7240 - W src 0 for shadow cube 7241 - X src 1 for shadow cube array */ 7242 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7243 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7244 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7245 alu.op = ALU_OP1_MOV; 7246 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) 7247 
r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7248 else 7249 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7250 alu.dst.sel = ctx->temp_reg; 7251 alu.dst.chan = 2; 7252 alu.dst.write = 1; 7253 alu.last = 1; 7254 r = r600_bytecode_add_alu(ctx->bc, &alu); 7255 if (r) 7256 return r; 7257 } 7258 7259 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7260 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7261 if (ctx->bc->chip_class >= EVERGREEN) { 7262 int mytmp = r600_get_temp(ctx); 7263 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7264 alu.op = ALU_OP1_MOV; 7265 alu.src[0].sel = ctx->temp_reg; 7266 alu.src[0].chan = 3; 7267 alu.dst.sel = mytmp; 7268 alu.dst.chan = 0; 7269 alu.dst.write = 1; 7270 alu.last = 1; 7271 r = r600_bytecode_add_alu(ctx->bc, &alu); 7272 if (r) 7273 return r; 7274 7275 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */ 7276 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7277 alu.op = ALU_OP3_MULADD; 7278 alu.is_op3 = 1; 7279 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7280 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 7281 alu.src[1].chan = 0; 7282 alu.src[1].value = u_bitcast_f2u(8.0f); 7283 alu.src[2].sel = mytmp; 7284 alu.src[2].chan = 0; 7285 alu.dst.sel = ctx->temp_reg; 7286 alu.dst.chan = 3; 7287 alu.dst.write = 1; 7288 alu.last = 1; 7289 r = r600_bytecode_add_alu(ctx->bc, &alu); 7290 if (r) 7291 return r; 7292 } else if (ctx->bc->chip_class < EVERGREEN) { 7293 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7294 tex.op = FETCH_OP_SET_CUBEMAP_INDEX; 7295 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7296 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7297 tex.src_gpr = r600_get_temp(ctx); 7298 tex.src_sel_x = 0; 7299 tex.src_sel_y = 0; 7300 tex.src_sel_z = 0; 7301 tex.src_sel_w = 0; 7302 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 7303 tex.coord_type_x = 1; 7304 tex.coord_type_y = 1; 7305 tex.coord_type_z = 1; 7306 tex.coord_type_w = 
1; 7307 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7308 alu.op = ALU_OP1_MOV; 7309 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7310 alu.dst.sel = tex.src_gpr; 7311 alu.dst.chan = 0; 7312 alu.last = 1; 7313 alu.dst.write = 1; 7314 r = r600_bytecode_add_alu(ctx->bc, &alu); 7315 if (r) 7316 return r; 7317 7318 r = r600_bytecode_add_tex(ctx->bc, &tex); 7319 if (r) 7320 return r; 7321 } 7322 7323 } 7324 7325 /* for cube forms of lod and bias we need to route things */ 7326 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB || 7327 inst->Instruction.Opcode == TGSI_OPCODE_TXL || 7328 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7329 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { 7330 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7331 alu.op = ALU_OP1_MOV; 7332 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || 7333 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) 7334 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 7335 else 7336 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); 7337 alu.dst.sel = ctx->temp_reg; 7338 alu.dst.chan = 2; 7339 alu.last = 1; 7340 alu.dst.write = 1; 7341 r = r600_bytecode_add_alu(ctx->bc, &alu); 7342 if (r) 7343 return r; 7344 } 7345 7346 src_loaded = TRUE; 7347 src_gpr = ctx->temp_reg; 7348 } 7349 7350 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) { 7351 int temp_h = 0, temp_v = 0; 7352 int start_val = 0; 7353 7354 /* if we've already loaded the src (i.e. CUBE don't reload it). 
*/ 7355 if (src_loaded == TRUE) 7356 start_val = 1; 7357 else 7358 src_loaded = TRUE; 7359 for (i = start_val; i < 3; i++) { 7360 int treg = r600_get_temp(ctx); 7361 7362 if (i == 0) 7363 src_gpr = treg; 7364 else if (i == 1) 7365 temp_h = treg; 7366 else 7367 temp_v = treg; 7368 7369 for (j = 0; j < 4; j++) { 7370 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7371 alu.op = ALU_OP1_MOV; 7372 r600_bytecode_src(&alu.src[0], &ctx->src[i], j); 7373 alu.dst.sel = treg; 7374 alu.dst.chan = j; 7375 if (j == 3) 7376 alu.last = 1; 7377 alu.dst.write = 1; 7378 r = r600_bytecode_add_alu(ctx->bc, &alu); 7379 if (r) 7380 return r; 7381 } 7382 } 7383 for (i = 1; i < 3; i++) { 7384 /* set gradients h/v */ 7385 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7386 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H : 7387 FETCH_OP_SET_GRADIENTS_V; 7388 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7389 tex.sampler_index_mode = sampler_index_mode; 7390 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7391 tex.resource_index_mode = sampler_index_mode; 7392 7393 tex.src_gpr = (i == 1) ? 
temp_h : temp_v; 7394 tex.src_sel_x = 0; 7395 tex.src_sel_y = 1; 7396 tex.src_sel_z = 2; 7397 tex.src_sel_w = 3; 7398 7399 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */ 7400 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7; 7401 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) { 7402 tex.coord_type_x = 1; 7403 tex.coord_type_y = 1; 7404 tex.coord_type_z = 1; 7405 tex.coord_type_w = 1; 7406 } 7407 r = r600_bytecode_add_tex(ctx->bc, &tex); 7408 if (r) 7409 return r; 7410 } 7411 } 7412 7413 if (src_requires_loading && !src_loaded) { 7414 for (i = 0; i < 4; i++) { 7415 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7416 alu.op = ALU_OP1_MOV; 7417 r600_bytecode_src(&alu.src[0], &ctx->src[0], i); 7418 alu.dst.sel = ctx->temp_reg; 7419 alu.dst.chan = i; 7420 if (i == 3) 7421 alu.last = 1; 7422 alu.dst.write = 1; 7423 r = r600_bytecode_add_alu(ctx->bc, &alu); 7424 if (r) 7425 return r; 7426 } 7427 src_loaded = TRUE; 7428 src_gpr = ctx->temp_reg; 7429 } 7430 7431 /* get offset values */ 7432 if (inst->Texture.NumOffsets) { 7433 assert(inst->Texture.NumOffsets == 1); 7434 7435 /* The texture offset feature doesn't work with the TXF instruction 7436 * and must be emulated by adding the offset to the texture coordinates. 
*/ 7437 if (txf_add_offsets) { 7438 const struct tgsi_texture_offset *off = inst->TexOffsets; 7439 7440 switch (inst->Texture.Texture) { 7441 case TGSI_TEXTURE_3D: 7442 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7443 alu.op = ALU_OP2_ADD_INT; 7444 alu.src[0].sel = src_gpr; 7445 alu.src[0].chan = 2; 7446 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 7447 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ]; 7448 alu.dst.sel = src_gpr; 7449 alu.dst.chan = 2; 7450 alu.dst.write = 1; 7451 alu.last = 1; 7452 r = r600_bytecode_add_alu(ctx->bc, &alu); 7453 if (r) 7454 return r; 7455 /* fall through */ 7456 7457 case TGSI_TEXTURE_2D: 7458 case TGSI_TEXTURE_SHADOW2D: 7459 case TGSI_TEXTURE_RECT: 7460 case TGSI_TEXTURE_SHADOWRECT: 7461 case TGSI_TEXTURE_2D_ARRAY: 7462 case TGSI_TEXTURE_SHADOW2D_ARRAY: 7463 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7464 alu.op = ALU_OP2_ADD_INT; 7465 alu.src[0].sel = src_gpr; 7466 alu.src[0].chan = 1; 7467 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 7468 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY]; 7469 alu.dst.sel = src_gpr; 7470 alu.dst.chan = 1; 7471 alu.dst.write = 1; 7472 alu.last = 1; 7473 r = r600_bytecode_add_alu(ctx->bc, &alu); 7474 if (r) 7475 return r; 7476 /* fall through */ 7477 7478 case TGSI_TEXTURE_1D: 7479 case TGSI_TEXTURE_SHADOW1D: 7480 case TGSI_TEXTURE_1D_ARRAY: 7481 case TGSI_TEXTURE_SHADOW1D_ARRAY: 7482 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7483 alu.op = ALU_OP2_ADD_INT; 7484 alu.src[0].sel = src_gpr; 7485 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 7486 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX]; 7487 alu.dst.sel = src_gpr; 7488 alu.dst.write = 1; 7489 alu.last = 1; 7490 r = r600_bytecode_add_alu(ctx->bc, &alu); 7491 if (r) 7492 return r; 7493 break; 7494 /* texture offsets do not apply to other texture targets */ 7495 } 7496 } else { 7497 switch (inst->Texture.Texture) { 7498 case TGSI_TEXTURE_3D: 7499 offset_z = ctx->literals[4 * 
inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1; 7500 /* fallthrough */ 7501 case TGSI_TEXTURE_2D: 7502 case TGSI_TEXTURE_SHADOW2D: 7503 case TGSI_TEXTURE_RECT: 7504 case TGSI_TEXTURE_SHADOWRECT: 7505 case TGSI_TEXTURE_2D_ARRAY: 7506 case TGSI_TEXTURE_SHADOW2D_ARRAY: 7507 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1; 7508 /* fallthrough */ 7509 case TGSI_TEXTURE_1D: 7510 case TGSI_TEXTURE_SHADOW1D: 7511 case TGSI_TEXTURE_1D_ARRAY: 7512 case TGSI_TEXTURE_SHADOW1D_ARRAY: 7513 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1; 7514 } 7515 } 7516 } 7517 7518 /* Obtain the sample index for reading a compressed MSAA color texture. 7519 * To read the FMASK, we use the ldfptr instruction, which tells us 7520 * where the samples are stored. 7521 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210, 7522 * which is the identity mapping. Each nibble says which physical sample 7523 * should be fetched to get that sample. 7524 * 7525 * Assume src.z contains the sample index. It should be modified like this: 7526 * src.z = (ldfptr() >> (src.z * 4)) & 0xF; 7527 * Then fetch the texel with src. 
7528 */ 7529 if (read_compressed_msaa) { 7530 unsigned sample_chan = 3; 7531 unsigned temp = r600_get_temp(ctx); 7532 assert(src_loaded); 7533 7534 /* temp.w = ldfptr() */ 7535 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7536 tex.op = FETCH_OP_LD; 7537 tex.inst_mod = 1; /* to indicate this is ldfptr */ 7538 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7539 tex.sampler_index_mode = sampler_index_mode; 7540 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7541 tex.resource_index_mode = sampler_index_mode; 7542 tex.src_gpr = src_gpr; 7543 tex.dst_gpr = temp; 7544 tex.dst_sel_x = 7; /* mask out these components */ 7545 tex.dst_sel_y = 7; 7546 tex.dst_sel_z = 7; 7547 tex.dst_sel_w = 0; /* store X */ 7548 tex.src_sel_x = 0; 7549 tex.src_sel_y = 1; 7550 tex.src_sel_z = 2; 7551 tex.src_sel_w = 3; 7552 tex.offset_x = offset_x; 7553 tex.offset_y = offset_y; 7554 tex.offset_z = offset_z; 7555 r = r600_bytecode_add_tex(ctx->bc, &tex); 7556 if (r) 7557 return r; 7558 7559 /* temp.x = sample_index*4 */ 7560 if (ctx->bc->chip_class == CAYMAN) { 7561 for (i = 0 ; i < 4; i++) { 7562 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7563 alu.op = ALU_OP2_MULLO_INT; 7564 alu.src[0].sel = src_gpr; 7565 alu.src[0].chan = sample_chan; 7566 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 7567 alu.src[1].value = 4; 7568 alu.dst.sel = temp; 7569 alu.dst.chan = i; 7570 alu.dst.write = i == 0; 7571 if (i == 3) 7572 alu.last = 1; 7573 r = r600_bytecode_add_alu(ctx->bc, &alu); 7574 if (r) 7575 return r; 7576 } 7577 } else { 7578 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7579 alu.op = ALU_OP2_MULLO_INT; 7580 alu.src[0].sel = src_gpr; 7581 alu.src[0].chan = sample_chan; 7582 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 7583 alu.src[1].value = 4; 7584 alu.dst.sel = temp; 7585 alu.dst.chan = 0; 7586 alu.dst.write = 1; 7587 alu.last = 1; 7588 r = r600_bytecode_add_alu(ctx->bc, &alu); 7589 if (r) 7590 return r; 7591 } 7592 7593 /* sample_index = temp.w >> temp.x */ 7594 
memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7595 alu.op = ALU_OP2_LSHR_INT; 7596 alu.src[0].sel = temp; 7597 alu.src[0].chan = 3; 7598 alu.src[1].sel = temp; 7599 alu.src[1].chan = 0; 7600 alu.dst.sel = src_gpr; 7601 alu.dst.chan = sample_chan; 7602 alu.dst.write = 1; 7603 alu.last = 1; 7604 r = r600_bytecode_add_alu(ctx->bc, &alu); 7605 if (r) 7606 return r; 7607 7608 /* sample_index & 0xF */ 7609 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7610 alu.op = ALU_OP2_AND_INT; 7611 alu.src[0].sel = src_gpr; 7612 alu.src[0].chan = sample_chan; 7613 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 7614 alu.src[1].value = 0xF; 7615 alu.dst.sel = src_gpr; 7616 alu.dst.chan = sample_chan; 7617 alu.dst.write = 1; 7618 alu.last = 1; 7619 r = r600_bytecode_add_alu(ctx->bc, &alu); 7620 if (r) 7621 return r; 7622 #if 0 7623 /* visualize the FMASK */ 7624 for (i = 0; i < 4; i++) { 7625 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7626 alu.op = ALU_OP1_INT_TO_FLT; 7627 alu.src[0].sel = src_gpr; 7628 alu.src[0].chan = sample_chan; 7629 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 7630 alu.dst.chan = i; 7631 alu.dst.write = 1; 7632 alu.last = 1; 7633 r = r600_bytecode_add_alu(ctx->bc, &alu); 7634 if (r) 7635 return r; 7636 } 7637 return 0; 7638 #endif 7639 } 7640 7641 /* does this shader want a num layers from TXQ for a cube array? 
*/ 7642 if (has_txq_cube_array_z) { 7643 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7644 7645 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 7646 alu.op = ALU_OP1_MOV; 7647 7648 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 7649 if (ctx->bc->chip_class >= EVERGREEN) { 7650 /* with eg each dword is number of cubes */ 7651 alu.src[0].sel += id / 4; 7652 alu.src[0].chan = id % 4; 7653 } else { 7654 /* r600 we have them at channel 2 of the second dword */ 7655 alu.src[0].sel += (id * 2) + 1; 7656 alu.src[0].chan = 2; 7657 } 7658 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 7659 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 7660 alu.last = 1; 7661 r = r600_bytecode_add_alu(ctx->bc, &alu); 7662 if (r) 7663 return r; 7664 /* disable writemask from texture instruction */ 7665 inst->Dst[0].Register.WriteMask &= ~4; 7666 } 7667 7668 opcode = ctx->inst_info->op; 7669 if (opcode == FETCH_OP_GATHER4 && 7670 inst->TexOffsets[0].File != TGSI_FILE_NULL && 7671 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) { 7672 opcode = FETCH_OP_GATHER4_O; 7673 7674 /* GATHER4_O/GATHER4_C_O use offset values loaded by 7675 SET_TEXTURE_OFFSETS instruction. The immediate offset values 7676 encoded in the instruction are ignored. 
*/ 7677 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7678 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS; 7679 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7680 tex.sampler_index_mode = sampler_index_mode; 7681 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS; 7682 tex.resource_index_mode = sampler_index_mode; 7683 7684 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index; 7685 tex.src_sel_x = inst->TexOffsets[0].SwizzleX; 7686 tex.src_sel_y = inst->TexOffsets[0].SwizzleY; 7687 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ; 7688 tex.src_sel_w = 4; 7689 7690 tex.dst_sel_x = 7; 7691 tex.dst_sel_y = 7; 7692 tex.dst_sel_z = 7; 7693 tex.dst_sel_w = 7; 7694 7695 r = r600_bytecode_add_tex(ctx->bc, &tex); 7696 if (r) 7697 return r; 7698 } 7699 7700 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || 7701 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D || 7702 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT || 7703 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7704 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY || 7705 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY || 7706 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7707 switch (opcode) { 7708 case FETCH_OP_SAMPLE: 7709 opcode = FETCH_OP_SAMPLE_C; 7710 break; 7711 case FETCH_OP_SAMPLE_L: 7712 opcode = FETCH_OP_SAMPLE_C_L; 7713 break; 7714 case FETCH_OP_SAMPLE_LB: 7715 opcode = FETCH_OP_SAMPLE_C_LB; 7716 break; 7717 case FETCH_OP_SAMPLE_G: 7718 opcode = FETCH_OP_SAMPLE_C_G; 7719 break; 7720 /* Texture gather variants */ 7721 case FETCH_OP_GATHER4: 7722 opcode = FETCH_OP_GATHER4_C; 7723 break; 7724 case FETCH_OP_GATHER4_O: 7725 opcode = FETCH_OP_GATHER4_C_O; 7726 break; 7727 } 7728 } 7729 7730 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 7731 tex.op = opcode; 7732 7733 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg); 7734 tex.sampler_index_mode = sampler_index_mode; 7735 tex.resource_id = tex.sampler_id + 
R600_MAX_CONST_BUFFERS; 7736 tex.resource_index_mode = sampler_index_mode; 7737 tex.src_gpr = src_gpr; 7738 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index; 7739 7740 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE || 7741 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) { 7742 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */ 7743 } 7744 7745 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) { 7746 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX]; 7747 tex.inst_mod = texture_component_select; 7748 7749 if (ctx->bc->chip_class == CAYMAN) { 7750 /* GATHER4 result order is different from TGSI TG4 */ 7751 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7; 7752 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7; 7753 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7; 7754 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 7755 } else { 7756 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 7757 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; 7758 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 7759 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 7760 } 7761 } 7762 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) { 7763 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 7764 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 7765 tex.dst_sel_z = 7; 7766 tex.dst_sel_w = 7; 7767 } 7768 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { 7769 tex.dst_sel_x = 3; 7770 tex.dst_sel_y = 7; 7771 tex.dst_sel_z = 7; 7772 tex.dst_sel_w = 7; 7773 } 7774 else { 7775 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; 7776 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; 7777 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 
2 : 7; 7778 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; 7779 } 7780 7781 7782 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { 7783 tex.src_sel_x = 4; 7784 tex.src_sel_y = 4; 7785 tex.src_sel_z = 4; 7786 tex.src_sel_w = 4; 7787 } else if (src_loaded) { 7788 tex.src_sel_x = 0; 7789 tex.src_sel_y = 1; 7790 tex.src_sel_z = 2; 7791 tex.src_sel_w = 3; 7792 } else { 7793 tex.src_sel_x = ctx->src[0].swizzle[0]; 7794 tex.src_sel_y = ctx->src[0].swizzle[1]; 7795 tex.src_sel_z = ctx->src[0].swizzle[2]; 7796 tex.src_sel_w = ctx->src[0].swizzle[3]; 7797 tex.src_rel = ctx->src[0].rel; 7798 } 7799 7800 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE || 7801 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || 7802 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || 7803 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { 7804 tex.src_sel_x = 1; 7805 tex.src_sel_y = 0; 7806 tex.src_sel_z = 3; 7807 tex.src_sel_w = 2; /* route Z compare or Lod value into W */ 7808 } 7809 7810 if (inst->Texture.Texture != TGSI_TEXTURE_RECT && 7811 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) { 7812 tex.coord_type_x = 1; 7813 tex.coord_type_y = 1; 7814 } 7815 tex.coord_type_z = 1; 7816 tex.coord_type_w = 1; 7817 7818 tex.offset_x = offset_x; 7819 tex.offset_y = offset_y; 7820 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 && 7821 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY || 7822 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) { 7823 tex.offset_z = 0; 7824 } 7825 else { 7826 tex.offset_z = offset_z; 7827 } 7828 7829 /* Put the depth for comparison in W. 7830 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W. 7831 * Some instructions expect the depth in Z. 
	 */
	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
	    opcode != FETCH_OP_SAMPLE_C_L &&
	    opcode != FETCH_OP_SAMPLE_C_LB) {
		/* route the comparison value (in Z for these targets) into W */
		tex.src_sel_w = tex.src_sel_z;
	}

	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
		if (opcode == FETCH_OP_SAMPLE_C_L ||
		    opcode == FETCH_OP_SAMPLE_C_LB) {
			/* the array index is read from Y */
			tex.coord_type_y = 0;
		} else {
			/* the array index is read from Z */
			tex.coord_type_z = 0;
			tex.src_sel_z = tex.src_sel_y;
		}
	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
		    (ctx->bc->chip_class >= EVERGREEN)))
		/* the array index is read from Z */
		tex.coord_type_z = 0;

	/* mask unused source components (sel 7 == no component) */
	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
		switch (inst->Texture.Texture) {
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_RECT:
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			tex.src_sel_y = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D:
			tex.src_sel_y = 7;
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		}
	}

	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	/* add shadow ambient support - gallium doesn't do it yet */
	return 0;
}

/* Resolve a TGSI HW_ATOMIC source register to the hardware atomic-counter
 * slot recorded in ctx->shader->atomics[].
 * Indirect accesses are matched by ArrayID; direct accesses are matched by
 * buffer id (Dimension.Index) and start/end index range, with the offset
 * inside the range added to the base hw index.
 * Asserts (and returns -1) if no range matches. */
static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
				  struct tgsi_full_src_register *src)
{
	unsigned i;

	if (src->Register.Indirect) {
		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
			if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
				return ctx->shader->atomics[i].hw_idx;
		}
	} else {
		uint32_t index = src->Register.Index;
		for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
			if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
				continue;
			if (index > ctx->shader->atomics[i].end)
				continue;
			if (index < ctx->shader->atomics[i].start)
				continue;
			uint32_t offset = (index - ctx->shader->atomics[i].start);
			return ctx->shader->atomics[i].hw_idx + offset;
		}
	}
	/* caller passed an atomic src that no declared range covers */
	assert(0);
	return -1;
}

/* Compute the UAV id and index mode for a GDS (atomic counter) operation.
 * On Cayman the counter's byte offset (uav_id * 4, plus any indirect index
 * shifted left by 2) is materialized in ctx->temp_reg.x; on pre-Cayman
 * chips an indirect access instead selects index mode 2 (CF_INDEX_1).
 * Returns 0 on success, the bytecode-emission error otherwise. */
static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
			     int *uav_id_p, int *uav_index_mode_p)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int uav_id, uav_index_mode = 0;
	int r;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	if (inst->Src[0].Register.Indirect) {
		if (is_cm) {
			/* temp.x = addr_reg << 2  (counter index -> byte offset) */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_LSHL_INT;
			alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
			alu.src[0].chan = 0;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 2;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* temp.x += uav_id * 4 (base byte offset of this counter) */
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   ctx->temp_reg, 0,
					   ctx->temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, uav_id * 4);
			if (r)
				return r;
		} else
			uav_index_mode = 2;
	} else if (is_cm) {
		/* direct access on Cayman: temp.x = uav_id * 4 */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   ctx->temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, uav_id * 4,
				   0, 0);
		if (r)
			return r;
	}
	*uav_id_p = uav_id;
	*uav_index_mode_p = uav_index_mode;
	return 0;
}

/* Load (read) an atomic counter value into the destination register
 * via a GDS_READ_RET fetch. */
static int tgsi_load_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_gds gds;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = FETCH_OP_GDS_READ_RET;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* on Cayman the uav offset lives in the temp reg, not the encoding */
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_sel_x = (is_cm) ? 0 : 4;
	gds.src_sel_y = 4;
	gds.src_sel_z = 4;
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.src_gpr2 = 0;
	gds.alloc_consume = !is_cm;
	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;

	ctx->bc->cf_last->vpm = 1;
	return 0;
}

/* Build a 4-component image/buffer coordinate in a fresh temp register:
 * components unused by the target are written as 0 and, for 1D arrays,
 * the array slice is moved from .y to .z (this fixes up 1D arrays
 * properly).  The temp register index is returned through *idx_gpr. */
static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r, i;
	struct r600_bytecode_alu alu;
	int temp_reg = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		bool def_val = true, write_zero = false;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;

		switch (inst->Memory.Texture) {
		case TGSI_TEXTURE_BUFFER:
		case TGSI_TEXTURE_1D:
			if (i == 1 || i == 2 || i == 3) {
				write_zero = true;
			}
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			if (i == 1 || i == 3)
				write_zero = true;
			else if (i == 2) {
				/* array slice comes from .y of the source */
				r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
				def_val = false;
			}
			break;
		case TGSI_TEXTURE_2D:
			if (i == 2 || i == 3)
				write_zero = true;
			break;
		default:
			if (i == 3)
				write_zero = true;
			break;
		}

		if (write_zero) {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0;
		} else if (def_val) {
			r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
		}

		if (i == 3)
			alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	*idx_gpr = temp_reg;
	return 0;
}

/* Load the element index for a buffer (SSBO) access into temp_reg.x.
 * The TGSI source holds a byte offset, so it is divided by 4 (>> 2);
 * immediate offsets are folded at compile time instead of emitting a
 * shift. */
static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
			     int temp_reg)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, value >> 2,
				   0, 0);
		if (r)
			return r;
	} else {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHR_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 2;
		alu.dst.sel = temp_reg;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Load from a buffer resource (SSBO) with an untyped VFETCH.
 * The fetch format (FMT_32 .. FMT_32_32_32_32) is chosen from the highest
 * enabled destination component. */
static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int temp_reg = r600_get_temp(ctx);
	unsigned rat_index_mode;
	unsigned base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	/* buffers sit after the image resources in the real resource space */
	base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];

	r = load_buffer_coord(ctx, 1, temp_reg);
	if (r)
		return r;
	ctx->bc->cf_last->barrier = 1;
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = inst->Src[0].Register.Index + base;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = temp_reg;
	vtx.src_sel_x = 0;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;	/* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;	/* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;	/* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;	/* SEL_W */
	vtx.num_format_all = 1;
	vtx.format_comp_all = 1;
	vtx.srf_mode_all = 0;

	if (inst->Dst[0].Register.WriteMask & 8) {
		vtx.data_format = FMT_32_32_32_32;
		vtx.use_const_fields = 0;
	} else if (inst->Dst[0].Register.WriteMask & 4) {
		vtx.data_format = FMT_32_32_32;
		vtx.use_const_fields = 0;
	} else if (inst->Dst[0].Register.WriteMask & 2) {
		vtx.data_format = FMT_32_32;
		vtx.use_const_fields = 0;
	} else {
		vtx.data_format = FMT_32;
		vtx.use_const_fields = 0;
	}

	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}

/* Load from an image resource: a RAT NOP_RTN read returns the texel into
 * the RAT immediate return buffer, which is then fetched back with a
 * VFETCH (continued below). */
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	r = load_thread_id_gpr(ctx);
	if (r)
		return r;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* RAT read: returns the texel into the immediate return buffer,
	 * addressed per-thread via thread_id_gpr */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* wait for the RAT return data before fetching it */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;

	/* fetch the returned texel from the immediate buffer, converting it
	 * according to the image's declared memory format */
	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}

/* Load from LDS (TGSI_FILE_MEMORY): copies the address from src[1].x into
 * a temp and lets do_lds_fetch_values() read the components selected by
 * the destination write mask. */
static int tgsi_load_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int temp_reg = r600_get_temp(ctx);

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	r = do_lds_fetch_values(ctx, temp_reg,
				ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
	if (r)
		return r;
	return 0;
}

/* Dispatch TGSI LOAD according to the source register file. */
static int tgsi_load(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
		return tgsi_load_rat(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
		return tgsi_load_gds(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
		return tgsi_load_buffer(ctx);
	if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
		return tgsi_load_lds(ctx);
	return 0;
}

/* Store to a buffer resource (SSBO): emitted as one single-component typed
 * RAT store per enabled destination channel, each addressed at
 * base-element + channel index. */
static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	int r, i;
	unsigned rat_index_mode;
	int lasti;
	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);

	/* treg2.x = element index of the store destination */
	r = load_buffer_coord(ctx, 0, treg2);
	if (r)
		return r;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* zero-initialize all four channels of the index temp */
	for (i = 0; i <= 3; i++) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;
		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.last = (i == 3);
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		struct r600_bytecode_alu alu;
		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
			continue;

		/* temp.x = base element + i (address of this component) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   treg2, 0,
				   V_SQ_ALU_SRC_LITERAL, i);
		if (r)
			return r;

		/* stage the value to store in ctx->temp_reg.x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
		cf = ctx->bc->cf_last;

		/* buffer RATs follow the image RATs in the RAT space */
		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
		cf->rat.inst = V_RAT_INST_STORE_TYPED;
		cf->rat.index_mode = rat_index_mode;
		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		cf->output.gpr = ctx->temp_reg;
		cf->output.index_gpr = temp_reg;
		cf->output.comp_mask = 1;
		cf->output.burst_count = 1;
		cf->vpm = 1;
		cf->barrier = 1;
		cf->output.elem_size = 0;
	}
	return 0;
}

/* Store to an image resource via a single typed RAT store; the value is
 * copied into a temp first unless it already lives in a TGSI temporary. */
static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	return 0;
}

/* Store to LDS (TGSI_FILE_MEMORY): src[0].x holds the base address,
 * src[1] the values to write (one LDS write per enabled channel,
 * continued below). */
static int tgsi_store_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i, lasti;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	/* LDS write */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* channel i is written at base address + 4*i bytes */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i))) 
			continue;

		/* two consecutive written channels can be combined into one
		 * LDS_WRITE_REL (writes channel i and i+1 in one op) */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;

			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
			alu.last = 1;
			alu.is_lds_idx_op = true;
			alu.lds_idx = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1; /* the pair's second channel was consumed too */
			continue;
		}
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;

		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

		alu.last = 1;
		alu.is_lds_idx_op = true;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Dispatch a TGSI STORE to the right backend based on the destination
 * register file: buffer RAT, LDS, or image RAT. */
static int tgsi_store(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
		return tgsi_store_buffer_rat(ctx);
	else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
		return tgsi_store_lds(ctx);
	else
		return tgsi_store_rat(ctx);
}

/* Emit an atomic RAT operation (image or buffer): stage the operands in
 * the thread-id GPR, issue MEM_RAT with the atomic opcode, wait for the
 * ack, then fetch the returned value from the immediate-return buffer. */
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;
	unsigned rat_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	rat_base = ctx->shader->rat_base;

	r = load_thread_id_gpr(ctx);
	if (r)
		return r;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* buffer RATs/immediates follow the image ones */
		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];

		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
		if (r)
			return r;
		idx_gpr = ctx->temp_reg;
	} else {
		r = load_index_src(ctx, 1, &idx_gpr);
		if (r)
			return r;
	}

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* CMPXCHG takes two operands: new value in chan 0, compare
		 * value in chan 3 (chan 2 on Cayman). */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		if (ctx->bc->chip_class == CAYMAN)
			alu.dst.chan = 2;
		else
			alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;
	/* the atomic result must land before we fetch it back */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	cf->cf_addr = 1;

	/* fetch the returned value from the immediate return buffer */
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		desc = util_format_description(inst->Memory.Format);
		r600_vertex_data_type(inst->Memory.Format,
				      &format, &num_format, &format_comp, &endian);
		vtx.dst_sel_x = desc->swizzle[0];
	} else {
		format = FMT_32;
		num_format = 1;
		format_comp = 0;
		endian = 0;
		vtx.dst_sel_x = 0;
	}
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_y = 7; /* 7 == masked */
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}

/* Map a TGSI atomic opcode to the corresponding GDS fetch opcode,
 * or -1 if the opcode has no GDS equivalent. */
static int get_gds_op(int opcode)
{
	switch (opcode) {
	case TGSI_OPCODE_ATOMUADD:
		return FETCH_OP_GDS_ADD_RET;
	case TGSI_OPCODE_ATOMAND:
		return FETCH_OP_GDS_AND_RET;
	case TGSI_OPCODE_ATOMOR:
		return FETCH_OP_GDS_OR_RET;
	case TGSI_OPCODE_ATOMXOR:
		return FETCH_OP_GDS_XOR_RET;
	case TGSI_OPCODE_ATOMUMIN:
		return FETCH_OP_GDS_MIN_UINT_RET;
	case TGSI_OPCODE_ATOMUMAX:
		return FETCH_OP_GDS_MAX_UINT_RET;
	case TGSI_OPCODE_ATOMXCHG:
		return FETCH_OP_GDS_XCHG_RET;
	case TGSI_OPCODE_ATOMCAS:
		return FETCH_OP_GDS_CMP_XCHG_RET;
	default:
		return -1;
	}
}

/* Emit an atomic on a HW atomic counter living in GDS. The operand is
 * staged in ctx->temp_reg (channel differs between Cayman and other
 * Evergreen parts), then a GDS fetch clause instruction is issued. */
static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		/* a negative immediate add is turned into a subtract of the
		 * absolute value */
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}


	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = gds_op;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* Cayman encodes the UAV id differently; uav_id/index_mode are
	 * only used on non-Cayman parts here */
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_gpr2 = 0;
	gds.src_sel_x = is_cm ? 0 : 4;
	gds.src_sel_y = is_cm ? 1 : 0;
	gds.src_sel_z = 7;
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.alloc_consume = !is_cm;

	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}

/* Map a TGSI atomic opcode to the corresponding LDS ALU opcode,
 * or -1 if the opcode has no LDS equivalent. */
static int get_lds_op(int opcode)
{
	switch (opcode) {
	case TGSI_OPCODE_ATOMUADD:
		return LDS_OP2_LDS_ADD_RET;
	case TGSI_OPCODE_ATOMAND:
		return LDS_OP2_LDS_AND_RET;
	case TGSI_OPCODE_ATOMOR:
		return LDS_OP2_LDS_OR_RET;
	case TGSI_OPCODE_ATOMXOR:
		return LDS_OP2_LDS_XOR_RET;
	case TGSI_OPCODE_ATOMUMIN:
		return LDS_OP2_LDS_MIN_UINT_RET;
	case TGSI_OPCODE_ATOMUMAX:
		return LDS_OP2_LDS_MAX_UINT_RET;
	case TGSI_OPCODE_ATOMIMIN:
		return LDS_OP2_LDS_MIN_INT_RET;
	case TGSI_OPCODE_ATOMIMAX:
		return LDS_OP2_LDS_MAX_INT_RET;
	case TGSI_OPCODE_ATOMXCHG:
		return LDS_OP2_LDS_XCHG_RET;
	case TGSI_OPCODE_ATOMCAS:
		return LDS_OP3_LDS_CMP_XCHG_RET;
	default:
return -1; 8729 } 8730 } 8731 8732 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx) 8733 { 8734 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8735 int lds_op = get_lds_op(inst->Instruction.Opcode); 8736 int r; 8737 8738 struct r600_bytecode_alu alu; 8739 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8740 alu.op = lds_op; 8741 alu.is_lds_idx_op = true; 8742 alu.last = 1; 8743 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0); 8744 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0); 8745 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET) 8746 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0); 8747 else 8748 alu.src[2].sel = V_SQ_ALU_SRC_0; 8749 r = r600_bytecode_add_alu(ctx->bc, &alu); 8750 if (r) 8751 return r; 8752 8753 /* then read from LDS_OQ_A_POP */ 8754 memset(&alu, 0, sizeof(alu)); 8755 8756 alu.op = ALU_OP1_MOV; 8757 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP; 8758 alu.src[0].chan = 0; 8759 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); 8760 alu.dst.write = 1; 8761 alu.last = 1; 8762 r = r600_bytecode_add_alu(ctx->bc, &alu); 8763 if (r) 8764 return r; 8765 8766 return 0; 8767 } 8768 8769 static int tgsi_atomic_op(struct r600_shader_ctx *ctx) 8770 { 8771 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8772 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) 8773 return tgsi_atomic_op_rat(ctx); 8774 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC) 8775 return tgsi_atomic_op_gds(ctx); 8776 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) 8777 return tgsi_atomic_op_rat(ctx); 8778 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY) 8779 return tgsi_atomic_op_lds(ctx); 8780 return 0; 8781 } 8782 8783 static int tgsi_resq(struct r600_shader_ctx *ctx) 8784 { 8785 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; 8786 unsigned sampler_index_mode; 8787 struct r600_bytecode_tex tex; 8788 int r; 8789 boolean has_txq_cube_array_z = false; 8790 8791 if (inst->Src[0].Register.File == 
TGSI_FILE_BUFFER || 8792 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) { 8793 if (ctx->bc->chip_class < EVERGREEN) 8794 ctx->shader->uses_tex_buffers = true; 8795 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset); 8796 } 8797 8798 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY && 8799 inst->Dst[0].Register.WriteMask & 4) { 8800 ctx->shader->has_txq_cube_array_z_comp = true; 8801 has_txq_cube_array_z = true; 8802 } 8803 8804 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE 8805 if (sampler_index_mode) 8806 egcm_load_index_reg(ctx->bc, 1, false); 8807 8808 8809 /* does this shader want a num layers from TXQ for a cube array? */ 8810 if (has_txq_cube_array_z) { 8811 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset; 8812 struct r600_bytecode_alu alu; 8813 8814 memset(&alu, 0, sizeof(struct r600_bytecode_alu)); 8815 alu.op = ALU_OP1_MOV; 8816 8817 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; 8818 /* with eg each dword is either number of cubes */ 8819 alu.src[0].sel += id / 4; 8820 alu.src[0].chan = id % 4; 8821 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; 8822 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); 8823 alu.last = 1; 8824 r = r600_bytecode_add_alu(ctx->bc, &alu); 8825 if (r) 8826 return r; 8827 /* disable writemask from texture instruction */ 8828 inst->Dst[0].Register.WriteMask &= ~4; 8829 } 8830 memset(&tex, 0, sizeof(struct r600_bytecode_tex)); 8831 tex.op = ctx->inst_info->op; 8832 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index; 8833 tex.sampler_index_mode = sampler_index_mode; 8834 tex.resource_id = tex.sampler_id; 8835 tex.resource_index_mode = sampler_index_mode; 8836 tex.src_sel_x = 4; 8837 tex.src_sel_y = 4; 8838 tex.src_sel_z = 4; 8839 tex.src_sel_w = 4; 8840 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 
			0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	return 0;
}

/* Emit TGSI LRP: dst = src0 * src1 + (1 - src0) * src2.
 * A src0 of exactly 0.5 is special-cased to (src1 + src2) * 0.5 using the
 * output modifier; otherwise it is built as ADD + MUL + MULADD. */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned i, temp_regs[2];
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3; /* output modifier: result * 0.5 */
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */
	/* op3 sources cannot carry an abs modifier, so abs sources are
	 * staged into temps by tgsi_make_src_for_op3 */
	if (ctx->src[0].abs)
		temp_regs[0] = r600_get_temp(ctx);
	else
		temp_regs[0] = 0;
	if (ctx->src[1].abs)
		temp_regs[1] = r600_get_temp(ctx);
	else
		temp_regs[1] = 0;

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
		if (r)
			return r;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Emit TGSI CMP: dst = (src0 < 0) ? src1 : src2, built with CNDGE by
 * swapping the operand order (or CNDE when src0 carries abs+neg, since
 * -|x| < 0 except when x == 0). */
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int temp_regs[3];
	unsigned op;

	if (ctx->src[0].abs && ctx->src[0].neg) {
		op = ALU_OP3_CNDE;
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}
	/* op3 sources cannot carry an abs modifier; stage abs sources in temps */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		temp_regs[j] = 0;
		if (ctx->src[j].abs)
			temp_regs[j] = r600_get_temp(ctx);
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		/* note src2/src1 are swapped relative to TGSI operand order */
		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
		if (r)
			return r;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Emit TGSI UCMP: dst = (src0 != 0) ? src1 : src2, via integer CNDE with
 * the src1/src2 operands swapped. */
static int tgsi_ucmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Emit TGSI EXP into temp_reg channels (x = 2^floor(src), y = fract(src),
 * z = 2^src, w = 1.0), then copy to the destination. EXP_IEEE is t-slot
 * only pre-Cayman; on Cayman it is broadcast over three vector slots (see
 * the CAYMAN notes at the top of this file). */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			/* reuses the alu struct from the FLOOR above */
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return tgsi_helper_copy(ctx, inst);
}

/* Emit TGSI LOG into temp_reg (x = floor(log2|src|),
 * y = |src| / 2^floor(log2|src|), z = log2|src|, w = 1.0), then copy to
 * the destination. LOG/EXP/RECIP are t-slot only pre-Cayman and broadcast
 * over three vector slots on Cayman. */
static int tgsi_log(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = floor(log2(|src|)); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 0)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* NOTE(review): no memset here — this FLOOR reuses the alu
		 * struct (including the abs flag set on src[0] above) with
		 * src rebound to temp_reg; confirm the abs carryover is
		 * intentional before restructuring. */
		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* y = |src| * (1 / 2^floor(log2|src|)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		r600_bytecode_src_set_abs(&alu.src[0]);

		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = log2(|src|);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				if (i == 2)
					alu.dst.write = 1;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0; */
	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return tgsi_helper_copy(ctx, inst);
}

/* Emit ARL/ARR/UARL on Evergreen+: address values are kept in a regular
 * GPR (per-index address register file) and the cached AR/index-reg state
 * is invalidated so it gets reloaded on next use. */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		alu.op
			= ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* invalidate the cached index/AR register so it is reloaded */
	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* Emit ARL/ARR/UARL on r600/r700: convert the source into the dedicated
 * AR staging register (ar_reg) and mark AR as needing a reload. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* ARL rounds toward -inf: FLOOR first, then FLT_TO_INT */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	ctx->bc->ar_loaded = 0;
	return 0;
}

/* Emit TGSI DST: dst = (1, src0.y*src1.y, src0.z, src1.w). */
static int tgsi_opdst(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r = 0;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* channels x and w use 1.0 for src0; x and z use 1.0 for src1 */
		if (i == 0 || i == 3) {
			alu.src[0].sel = V_SQ_ALU_SRC_1;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		if (i == 0 || i == 2) {
			alu.src[1].sel = V_SQ_ALU_SRC_1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* Emit a predicate-setting compare of src[0] against 0 (opcode chosen by
 * the caller), updating the execute mask, for flow-control lowering. */
static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opcode;
	alu.execute_mask = 1;
	alu.update_pred = 1;

	/* the ALU result itself lands in temp_reg.x; only the predicate /
	 * execute-mask side effects matter to the caller */
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.dst.chan = 0;

	/* compare src0.x against the inline constant 0 */
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0;
	alu.src[1].chan = 0;

	alu.last = 1;

	r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
	if (r)
		return r;
	return 0;
}

/* Pop 'pops' entries off the hardware branch stack.  When the previous CF
 * instruction is an ALU clause and the total pop count fits, the pop is
 * folded into it by retyping it as ALU_POP_AFTER / ALU_POP2_AFTER;
 * otherwise an explicit POP CF instruction is emitted. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more than two pops cannot be folded into an ALU clause */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}

/* Recompute the worst-case hardware stack usage at this point of the
 * shader and fold it into stack->max_entries.  'reason' identifies the
 * kind of push (if any) that triggered the update.  Returns the raw
 * element count (before rounding to whole entries). */
static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on
empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 * NOTE: it seems we also need to reserve additional element in some
		 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 * then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM || stack->push > 0) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round up to whole stack entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
	return elements;
}

/* Pop one frame of the given kind ('reason') from the compile-time stack
 * accounting; the counters must never go negative. */
static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch(reason) {
	case FC_PUSH_VPM:
		--ctx->bc->stack.push;
		assert(ctx->bc->stack.push >= 0);
		break;
	case FC_PUSH_WQM:
		--ctx->bc->stack.push_wqm;
		assert(ctx->bc->stack.push_wqm >= 0);
		break;
	case FC_LOOP:
		--ctx->bc->stack.loop;
		assert(ctx->bc->stack.loop >= 0);
		break;
	default:
		assert(0);
		break;
	}
}

/* Push one frame of the given kind onto the compile-time stack accounting
 * and return the updated element count (also refreshes the recorded
 * maximum stack depth). */
static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch (reason) {
	case FC_PUSH_VPM:
		++ctx->bc->stack.push;
		break;
	case FC_PUSH_WQM:
		++ctx->bc->stack.push_wqm;
		break;
	case FC_LOOP:
		++ctx->bc->stack.loop;
		break;
	default:
		assert(0);
	}

	return callstack_update_max_depth(ctx, reason);
}

/* Record the CF instruction just emitted as a "mid" point (ELSE / BREAK /
 * CONTINUE) of the flow-control frame at index fc_sp; the mid array is
 * grown by one on every call.
 * NOTE(review): the realloc() result is not checked for NULL — an
 * allocation failure here would dereference a null pointer. */
static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];

	sp->mid = realloc((void *)sp->mid,
		sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
	sp->mid[sp->num_mid] = ctx->bc->cf_last;
	sp->num_mid++;
}

/* Open a new flow-control frame of the given type (FC_IF / FC_LOOP),
 * anchored at the last emitted CF instruction. */
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
{
	assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
	ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
	ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
	ctx->bc->fc_sp++;
}

/* Close the innermost flow-control frame and free its mid list. */
static void fc_poplevel(struct r600_shader_ctx *ctx)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
	free(sp->mid);
	sp->mid = NULL;
	sp->num_mid = 0;
	sp->start = NULL;
	sp->type = 0;
	ctx->bc->fc_sp--;
}

#if 0
/* Dead prototype code for subroutine-return support; never compiled.
 * NOTE(review): contains stray ')' typos that would not build as-is. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif

/* Emit the sequence opening an IF: a predicate-set ALU op (see
 * emit_logic_pred) plus a JUMP whose target address is patched later by
 * tgsi_else / tgsi_endif. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;
	bool needs_workaround = false;
	int elems = callstack_push(ctx, FC_PUSH_VPM);

	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
		needs_workaround = true;

	/* on Evergreen, the workaround is also needed when the push lands
	 * exactly on (or one below) a stack-entry boundary */
	if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
		unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
		unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;

		if (elems && (!dmod1 || !dmod2))
			needs_workaround = true;
	}

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected.
Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (needs_workaround) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	return 0;
}

/* IF with a float condition: branch taken when src != 0.0 */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}

/* UIF: IF with an integer condition (src != 0) */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}

/* ELSE: emit the CF instruction, record it as the frame's mid point and
 * patch the opening JUMP so it lands on the ELSE. */
static int tgsi_else(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
	return 0;
}

/* ENDIF: pop the branch stack, then patch the pending JUMP (if there was
 * no ELSE) or the ELSE (otherwise) to jump past the IF block. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	int offset = 2;
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	/* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
	if (ctx->bc->cf_last->eg_alu_extended)
		offset += 2;

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
		/* no ELSE seen: the IF's JUMP skips the whole block */
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
	} else {
		/* an ELSE was recorded as mid[0]: patch its jump target */
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}

/* BGNLOOP: open a loop frame and emit LOOP_START_DX10. */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_*
instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}

/* ENDLOOP: emit LOOP_END and fix up every jump target of the loop. */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	 */
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;

	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}

/* BRK/CONT: locate the innermost enclosing loop frame and emit the
 * matching LOOP_BREAK / LOOP_CONTINUE CF instruction; its jump target is
 * patched when the loop's ENDLOOP is reached. */
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
{
	unsigned int fscp;

	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
			break;
	}

	if (fscp == 0) {
		R600_ERR("Break not inside loop/endloop pair\n");
		return -EINVAL;
	}

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);

	fc_set_mid(ctx, fscp - 1);

	return 0;
}

/* EMIT/ENDPRIM for geometry shaders: for EMIT_VERTEX, first flush the
 * pending per-vertex ring writes for 'stream' (the stream index comes
 * from the literal src operand), then emit the CUT/EMIT CF instruction. */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx,
ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}

/* UMAD dst = src0 * src1 + src2 (unsigned): MULLO_UINT into temp_reg,
 * then ADD_INT with src2 into the real destination. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* on Cayman MULLO_UINT occupies all four vector slots with the
			 * same sources; only slot i's result is written (see the
			 * CAYMAN notes at the top of the file) */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/* PK2H: pack two half floats — temp.xy = f32_to_f16(src.xy), then each
 * written dst channel gets (temp.y << 16) | temp.x via MULADD_UINT24. */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* second conversion deliberately reuses the same alu setup, only
	 * switching to channel 1 and marking the end of the group */
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.y * 0x10000 + temp.x */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* UP2H: unpack two half floats from src.x (low/high 16 bits) into the
 * written dst channels. */
static int tgsi_up2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int
lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.x = src.x */
	/* note: no need to mask out the high bits */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src.x >> 16 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.dst.chan = 1;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 16;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.op = ALU_OP1_FLT16_TO_FLT32;
		alu.src[0].sel = ctx->temp_reg;
		/* even channels take the low half, odd channels the high half */
		alu.src[0].chan = i % 2;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* IBFE/UBFE bitfield extract with clamping of the width operand: when
 * src2 (width) >= 32 the raw src0 value is selected instead of the
 * BFE result (presumably because the hw opcode is undefined for such
 * widths — see the SETGE_INT/CNDE_INT pair below).  A temp destination
 * for the BFE is used when dst aliases src0 or src2, so the later
 * select still sees the original inputs. */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	/* emit the op3 (BFE) itself, into 'dst' when aliasing was detected */
	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp_reg.i = (src2.i >= 32) ? ~0 : 0 */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.i = temp_reg.i ? src0.i (width >= 32) : bfe result */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}

/* CLOCK: read the time counter into dst.xy (TIME_LO into x, TIME_HI
 * into y); the EG_ source selects make this Evergreen+ only. */
static int tgsi_clock(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* TGSI-opcode dispatch table for r600/r700 (pre-Evergreen) chips; each
 * entry pairs the hardware op (ALU/FETCH/CF) with its emit callback. */
static
const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	/* CLOCK needs the EG_*_TIME_* sources, so it is unsupported pre-EG */
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	/* flow control */
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
	[81] = { ALU_OP0_NOP, tgsi_unsupported},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	/* geometry shader vertex emission */
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
	/* integer opcodes */
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	/* DX SM5-style sample opcodes are not wired up on this backend */
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
	/* atomics are not supported on pre-Evergreen */
	[TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};

/* TGSI-opcode dispatch table for Evergreen (and later) chips. */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, 10455 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, 10456 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, 10457 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2}, 10458 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 10459 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 10460 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, 10461 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, 10462 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, 10463 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, 10464 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, 10465 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, 10466 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp}, 10467 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3}, 10468 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate}, 10469 [21] = { ALU_OP0_NOP, tgsi_unsupported}, 10470 [22] = { ALU_OP0_NOP, tgsi_unsupported}, 10471 [23] = { ALU_OP0_NOP, tgsi_unsupported}, 10472 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2}, 10473 [25] = { ALU_OP0_NOP, tgsi_unsupported}, 10474 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2}, 10475 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2}, 10476 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate}, 10477 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate}, 10478 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow}, 10479 [31] = { ALU_OP0_NOP, tgsi_unsupported}, 10480 [32] = { ALU_OP0_NOP, tgsi_unsupported}, 10481 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock}, 10482 [34] = { ALU_OP0_NOP, tgsi_unsupported}, 10483 [35] = { ALU_OP0_NOP, tgsi_unsupported}, 10484 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig}, 10485 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 10486 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 10487 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 10488 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, 10489 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, 
tgsi_unsupported}, 10490 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, 10491 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 10492 [44] = { ALU_OP0_NOP, tgsi_unsupported}, 10493 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2}, 10494 [46] = { ALU_OP0_NOP, tgsi_unsupported}, 10495 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2}, 10496 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig}, 10497 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap}, 10498 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2}, 10499 [51] = { ALU_OP0_NOP, tgsi_unsupported}, 10500 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, 10501 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, 10502 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, 10503 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, 10504 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, 10505 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported}, 10506 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 10507 [59] = { ALU_OP0_NOP, tgsi_unsupported}, 10508 [60] = { ALU_OP0_NOP, tgsi_unsupported}, 10509 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl}, 10510 [62] = { ALU_OP0_NOP, tgsi_unsupported}, 10511 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported}, 10512 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported}, 10513 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg}, 10514 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp}, 10515 [67] = { ALU_OP0_NOP, tgsi_unsupported}, 10516 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 10517 [69] = { ALU_OP0_NOP, tgsi_unsupported}, 10518 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported}, 10519 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 10520 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 10521 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 10522 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if}, 10523 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif}, 10524 [76] = { ALU_OP0_NOP, tgsi_unsupported}, 10525 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else}, 10526 
[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif}, 10527 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 10528 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 10529 [82] = { ALU_OP0_NOP, tgsi_unsupported}, 10530 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2}, 10531 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans}, 10532 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2}, 10533 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2}, 10534 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2}, 10535 [88] = { ALU_OP0_NOP, tgsi_unsupported}, 10536 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2}, 10537 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2}, 10538 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod}, 10539 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2}, 10540 [93] = { ALU_OP0_NOP, tgsi_unsupported}, 10541 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex}, 10542 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 10543 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 10544 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit}, 10545 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit}, 10546 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop}, 10547 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 10548 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, 10549 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 10550 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 10551 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, 10552 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq}, 10553 [106] = { ALU_OP0_NOP, tgsi_unsupported}, 10554 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported}, 10555 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, 10556 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, 10557 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 10558 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 10559 [TGSI_OPCODE_MEMBAR] = { 
ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 10560 [113] = { ALU_OP0_NOP, tgsi_unsupported}, 10561 [114] = { ALU_OP0_NOP, tgsi_unsupported}, 10562 [115] = { ALU_OP0_NOP, tgsi_unsupported}, 10563 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 10564 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 10565 /* Refer below for TGSI_OPCODE_DFMA */ 10566 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i}, 10567 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, 10568 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, 10569 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2}, 10570 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg}, 10571 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2}, 10572 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2}, 10573 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap}, 10574 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i}, 10575 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans}, 10576 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2}, 10577 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv}, 10578 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad}, 10579 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2}, 10580 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2}, 10581 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod}, 10582 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans}, 10583 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2}, 10584 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2}, 10585 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2}, 10586 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 10587 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2}, 10588 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 10589 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported}, 10590 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported}, 10591 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 10592 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported}, 10593 
[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported}, 10594 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported}, 10595 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported}, 10596 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported}, 10597 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported}, 10598 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported}, 10599 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported}, 10600 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported}, 10601 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported}, 10602 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported}, 10603 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported}, 10604 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl}, 10605 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, 10606 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, 10607 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, 10608 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, 10609 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store}, 10610 [163] = { ALU_OP0_NOP, tgsi_unsupported}, 10611 [164] = { ALU_OP0_NOP, tgsi_unsupported}, 10612 [165] = { ALU_OP0_NOP, tgsi_unsupported}, 10613 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 10614 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, 10615 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, 10616 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, 10617 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, 10618 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, 10619 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, 10620 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, 10621 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, 10622 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, 10623 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, 10624 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, 10625 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 10626 [TGSI_OPCODE_TXL2] 
= { FETCH_OP_SAMPLE_L, tgsi_tex}, 10627 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans}, 10628 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans}, 10629 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex}, 10630 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex}, 10631 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe}, 10632 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe}, 10633 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi}, 10634 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2}, 10635 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2}, 10636 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2}, 10637 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb}, 10638 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb}, 10639 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, 10640 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, 10641 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, 10642 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, 10643 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, 10644 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, 10645 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, 10646 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, 10647 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, 10648 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr }, 10649 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, 10650 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, 10651 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, 10652 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, 10653 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, 10654 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, 10655 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, 10656 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr}, 10657 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, 
tgsi_op3_64}, 10658 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, 10659 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, 10660 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, 10661 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, 10662 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, 10663 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, 10664 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, 10665 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, 10666 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, 10667 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, 10668 }; 10669 10670 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = { 10671 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl}, 10672 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, 10673 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit}, 10674 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr}, 10675 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr}, 10676 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp}, 10677 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log}, 10678 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2}, 10679 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2}, 10680 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 10681 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 10682 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst}, 10683 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2}, 10684 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2}, 10685 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, 10686 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, 10687 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3}, 10688 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp}, 10689 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3}, 10690 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr}, 10691 [21] = { ALU_OP0_NOP, tgsi_unsupported}, 10692 [22] 
= { ALU_OP0_NOP, tgsi_unsupported}, 10693 [23] = { ALU_OP0_NOP, tgsi_unsupported}, 10694 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2}, 10695 [25] = { ALU_OP0_NOP, tgsi_unsupported}, 10696 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2}, 10697 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2}, 10698 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr}, 10699 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr}, 10700 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow}, 10701 [31] = { ALU_OP0_NOP, tgsi_unsupported}, 10702 [32] = { ALU_OP0_NOP, tgsi_unsupported}, 10703 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock}, 10704 [34] = { ALU_OP0_NOP, tgsi_unsupported}, 10705 [35] = { ALU_OP0_NOP, tgsi_unsupported}, 10706 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig}, 10707 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 10708 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 10709 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */ 10710 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, 10711 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported}, 10712 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, 10713 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 10714 [44] = { ALU_OP0_NOP, tgsi_unsupported}, 10715 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2}, 10716 [46] = { ALU_OP0_NOP, tgsi_unsupported}, 10717 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2}, 10718 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig}, 10719 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap}, 10720 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2}, 10721 [51] = { ALU_OP0_NOP, tgsi_unsupported}, 10722 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, 10723 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, 10724 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, 10725 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, 10726 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, 10727 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, 
tgsi_unsupported}, 10728 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, 10729 [59] = { ALU_OP0_NOP, tgsi_unsupported}, 10730 [60] = { ALU_OP0_NOP, tgsi_unsupported}, 10731 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl}, 10732 [62] = { ALU_OP0_NOP, tgsi_unsupported}, 10733 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported}, 10734 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported}, 10735 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg}, 10736 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp}, 10737 [67] = { ALU_OP0_NOP, tgsi_unsupported}, 10738 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 10739 [69] = { ALU_OP0_NOP, tgsi_unsupported}, 10740 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported}, 10741 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp}, 10742 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 10743 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont}, 10744 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if}, 10745 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif}, 10746 [76] = { ALU_OP0_NOP, tgsi_unsupported}, 10747 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else}, 10748 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif}, 10749 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, 10750 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, 10751 [82] = { ALU_OP0_NOP, tgsi_unsupported}, 10752 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2}, 10753 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2}, 10754 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2}, 10755 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2}, 10756 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2}, 10757 [88] = { ALU_OP0_NOP, tgsi_unsupported}, 10758 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2}, 10759 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2}, 10760 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod}, 10761 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2}, 10762 [93] = { ALU_OP0_NOP, tgsi_unsupported}, 10763 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex}, 10764 
[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 10765 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont}, 10766 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit}, 10767 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit}, 10768 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop}, 10769 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 10770 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, 10771 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, 10772 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, 10773 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, 10774 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq}, 10775 [106] = { ALU_OP0_NOP, tgsi_unsupported}, 10776 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported}, 10777 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, 10778 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, 10779 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, 10780 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, 10781 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 10782 [113] = { ALU_OP0_NOP, tgsi_unsupported}, 10783 [114] = { ALU_OP0_NOP, tgsi_unsupported}, 10784 [115] = { ALU_OP0_NOP, tgsi_unsupported}, 10785 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ 10786 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */ 10787 /* Refer below for TGSI_OPCODE_DFMA */ 10788 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2}, 10789 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, 10790 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, 10791 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2}, 10792 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg}, 10793 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2}, 10794 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2}, 10795 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap}, 10796 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2}, 
10797 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2}, 10798 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2}, 10799 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv}, 10800 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad}, 10801 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2}, 10802 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2}, 10803 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod}, 10804 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr}, 10805 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2}, 10806 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2}, 10807 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2}, 10808 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap}, 10809 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2}, 10810 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 10811 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported}, 10812 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported}, 10813 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported}, 10814 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported}, 10815 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported}, 10816 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported}, 10817 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported}, 10818 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported}, 10819 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported}, 10820 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported}, 10821 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported}, 10822 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported}, 10823 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported}, 10824 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported}, 10825 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported}, 10826 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl}, 10827 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp}, 10828 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs}, 10829 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg}, 10830 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load}, 10831 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, 
tgsi_store}, 10832 [163] = { ALU_OP0_NOP, tgsi_unsupported}, 10833 [164] = { ALU_OP0_NOP, tgsi_unsupported}, 10834 [165] = { ALU_OP0_NOP, tgsi_unsupported}, 10835 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, 10836 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op}, 10837 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op}, 10838 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op}, 10839 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op}, 10840 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op}, 10841 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op}, 10842 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op}, 10843 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op}, 10844 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op}, 10845 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op}, 10846 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex}, 10847 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex}, 10848 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex}, 10849 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr}, 10850 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr}, 10851 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex}, 10852 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex}, 10853 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe}, 10854 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe}, 10855 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi}, 10856 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2}, 10857 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2}, 10858 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2}, 10859 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb}, 10860 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb}, 10861 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, 10862 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, 10863 
[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, 10864 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, 10865 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, 10866 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, 10867 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, 10868 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, 10869 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, 10870 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr }, 10871 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, 10872 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, 10873 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, 10874 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, 10875 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, 10876 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, 10877 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, 10878 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr}, 10879 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, 10880 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, 10881 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, 10882 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, 10883 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, 10884 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, 10885 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, 10886 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, 10887 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, 10888 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, 10889 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, 10890 }; 10891