1 /* 2 * Copyright 2010 Jerome Glisse <glisse (at) freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 */ 23 #include "r600_sq.h" 24 #include "r600_opcodes.h" 25 #include "r600_formats.h" 26 #include "r600_shader.h" 27 #include "r600d.h" 28 29 #include <errno.h> 30 #include "util/u_bitcast.h" 31 #include "util/u_dump.h" 32 #include "util/u_memory.h" 33 #include "util/u_math.h" 34 #include "pipe/p_shader_tokens.h" 35 36 #include "sb/sb_public.h" 37 38 #define NUM_OF_CYCLES 3 39 #define NUM_OF_COMPONENTS 4 40 41 static inline bool alu_writes(struct r600_bytecode_alu *alu) 42 { 43 return alu->dst.write || alu->is_op3; 44 } 45 46 static inline unsigned int r600_bytecode_get_num_operands(const struct r600_bytecode_alu *alu) 47 { 48 return r600_isa_alu(alu->op)->src_count; 49 } 50 51 static struct r600_bytecode_cf *r600_bytecode_cf(void) 52 { 53 struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf); 54 55 if (!cf) 56 return NULL; 57 LIST_INITHEAD(&cf->list); 58 LIST_INITHEAD(&cf->alu); 59 LIST_INITHEAD(&cf->vtx); 60 LIST_INITHEAD(&cf->tex); 61 LIST_INITHEAD(&cf->gds); 62 return cf; 63 } 64 65 static struct r600_bytecode_alu *r600_bytecode_alu(void) 66 { 67 struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu); 68 69 if (!alu) 70 return NULL; 71 LIST_INITHEAD(&alu->list); 72 return alu; 73 } 74 75 static struct r600_bytecode_vtx *r600_bytecode_vtx(void) 76 { 77 struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx); 78 79 if (!vtx) 80 return NULL; 81 LIST_INITHEAD(&vtx->list); 82 return vtx; 83 } 84 85 static struct r600_bytecode_tex *r600_bytecode_tex(void) 86 { 87 struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex); 88 89 if (!tex) 90 return NULL; 91 LIST_INITHEAD(&tex->list); 92 return tex; 93 } 94 95 static struct r600_bytecode_gds *r600_bytecode_gds(void) 96 { 97 struct r600_bytecode_gds *gds = CALLOC_STRUCT(r600_bytecode_gds); 98 99 if (gds == NULL) 100 return NULL; 101 LIST_INITHEAD(&gds->list); 102 return gds; 103 } 104 105 static unsigned stack_entry_size(enum radeon_family chip) { 106 /* Wavefront size: 107 * 64: 
R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/ 108 * Aruba/Sumo/Sumo2/redwood/juniper 109 * 32: R630/R730/R710/Palm/Cedar 110 * 16: R610/Rs780 111 * 112 * Stack row size: 113 * Wavefront Size 16 32 48 64 114 * Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4 115 * Columns per Row (R9xx+) 8 4 4 4 */ 116 117 switch (chip) { 118 /* FIXME: are some chips missing here? */ 119 /* wavefront size 16 */ 120 case CHIP_RV610: 121 case CHIP_RS780: 122 case CHIP_RV620: 123 case CHIP_RS880: 124 /* wavefront size 32 */ 125 case CHIP_RV630: 126 case CHIP_RV635: 127 case CHIP_RV730: 128 case CHIP_RV710: 129 case CHIP_PALM: 130 case CHIP_CEDAR: 131 return 8; 132 133 /* wavefront size 64 */ 134 default: 135 return 4; 136 } 137 } 138 139 void r600_bytecode_init(struct r600_bytecode *bc, 140 enum chip_class chip_class, 141 enum radeon_family family, 142 bool has_compressed_msaa_texturing) 143 { 144 static unsigned next_shader_id = 0; 145 146 bc->debug_id = ++next_shader_id; 147 148 if ((chip_class == R600) && 149 (family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) { 150 bc->ar_handling = AR_HANDLE_RV6XX; 151 bc->r6xx_nop_after_rel_dst = 1; 152 } else { 153 bc->ar_handling = AR_HANDLE_NORMAL; 154 bc->r6xx_nop_after_rel_dst = 0; 155 } 156 157 LIST_INITHEAD(&bc->cf); 158 bc->chip_class = chip_class; 159 bc->family = family; 160 bc->has_compressed_msaa_texturing = has_compressed_msaa_texturing; 161 bc->stack.entry_size = stack_entry_size(family); 162 } 163 164 int r600_bytecode_add_cf(struct r600_bytecode *bc) 165 { 166 struct r600_bytecode_cf *cf = r600_bytecode_cf(); 167 168 if (!cf) 169 return -ENOMEM; 170 LIST_ADDTAIL(&cf->list, &bc->cf); 171 if (bc->cf_last) { 172 cf->id = bc->cf_last->id + 2; 173 if (bc->cf_last->eg_alu_extended) { 174 /* take into account extended alu size */ 175 cf->id += 2; 176 bc->ndw += 2; 177 } 178 } 179 bc->cf_last = cf; 180 bc->ncf++; 181 bc->ndw += 2; 182 bc->force_add_cf = 0; 183 bc->ar_loaded = 0; 184 return 0; 185 } 186 187 int 
r600_bytecode_add_output(struct r600_bytecode *bc, 188 const struct r600_bytecode_output *output) 189 { 190 int r; 191 192 if (output->gpr >= bc->ngpr) 193 bc->ngpr = output->gpr + 1; 194 195 if (bc->cf_last && (bc->cf_last->op == output->op || 196 (bc->cf_last->op == CF_OP_EXPORT && 197 output->op == CF_OP_EXPORT_DONE)) && 198 output->type == bc->cf_last->output.type && 199 output->elem_size == bc->cf_last->output.elem_size && 200 output->swizzle_x == bc->cf_last->output.swizzle_x && 201 output->swizzle_y == bc->cf_last->output.swizzle_y && 202 output->swizzle_z == bc->cf_last->output.swizzle_z && 203 output->swizzle_w == bc->cf_last->output.swizzle_w && 204 output->comp_mask == bc->cf_last->output.comp_mask && 205 (output->burst_count + bc->cf_last->output.burst_count) <= 16) { 206 207 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr && 208 (output->array_base + output->burst_count) == bc->cf_last->output.array_base) { 209 210 bc->cf_last->op = bc->cf_last->output.op = output->op; 211 bc->cf_last->output.gpr = output->gpr; 212 bc->cf_last->output.array_base = output->array_base; 213 bc->cf_last->output.burst_count += output->burst_count; 214 return 0; 215 216 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) && 217 output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) { 218 219 bc->cf_last->op = bc->cf_last->output.op = output->op; 220 bc->cf_last->output.burst_count += output->burst_count; 221 return 0; 222 } 223 } 224 225 r = r600_bytecode_add_cf(bc); 226 if (r) 227 return r; 228 bc->cf_last->op = output->op; 229 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output)); 230 bc->cf_last->barrier = 1; 231 return 0; 232 } 233 234 /* alu instructions that can ony exits once per group */ 235 static int is_alu_once_inst(struct r600_bytecode_alu *alu) 236 { 237 return r600_isa_alu(alu->op)->flags & (AF_KILL | AF_PRED) || alu->is_lds_idx_op || alu->op == 
ALU_OP0_GROUP_BARRIER; 238 } 239 240 static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 241 { 242 return (r600_isa_alu(alu->op)->flags & AF_REPL) && 243 (r600_isa_alu_slots(bc->isa->hw_class, alu->op) == AF_4V); 244 } 245 246 static int is_alu_mova_inst(struct r600_bytecode_alu *alu) 247 { 248 return r600_isa_alu(alu->op)->flags & AF_MOVA; 249 } 250 251 static int alu_uses_rel(struct r600_bytecode_alu *alu) 252 { 253 unsigned num_src = r600_bytecode_get_num_operands(alu); 254 unsigned src; 255 256 if (alu->dst.rel) { 257 return 1; 258 } 259 260 for (src = 0; src < num_src; ++src) { 261 if (alu->src[src].rel) { 262 return 1; 263 } 264 } 265 return 0; 266 } 267 268 static int is_lds_read(int sel) 269 { 270 return sel == EG_V_SQ_ALU_SRC_LDS_OQ_A_POP || sel == EG_V_SQ_ALU_SRC_LDS_OQ_B_POP; 271 } 272 273 static int alu_uses_lds(struct r600_bytecode_alu *alu) 274 { 275 unsigned num_src = r600_bytecode_get_num_operands(alu); 276 unsigned src; 277 278 for (src = 0; src < num_src; ++src) { 279 if (is_lds_read(alu->src[src].sel)) { 280 return 1; 281 } 282 } 283 return 0; 284 } 285 286 static int is_alu_64bit_inst(struct r600_bytecode_alu *alu) 287 { 288 const struct alu_op_info *op = r600_isa_alu(alu->op); 289 return (op->flags & AF_64); 290 } 291 292 static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 293 { 294 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 295 return !(slots & AF_S); 296 } 297 298 static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 299 { 300 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 301 return !(slots & AF_V); 302 } 303 304 /* alu instructions that can execute on any unit */ 305 static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 306 { 307 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 308 return slots == AF_VS; 309 } 310 311 static int 
is_nop_inst(struct r600_bytecode_alu *alu) 312 { 313 return alu->op == ALU_OP0_NOP; 314 } 315 316 static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first, 317 struct r600_bytecode_alu *assignment[5]) 318 { 319 struct r600_bytecode_alu *alu; 320 unsigned i, chan, trans; 321 int max_slots = bc->chip_class == CAYMAN ? 4 : 5; 322 323 for (i = 0; i < max_slots; i++) 324 assignment[i] = NULL; 325 326 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) { 327 chan = alu->dst.chan; 328 if (max_slots == 4) 329 trans = 0; 330 else if (is_alu_trans_unit_inst(bc, alu)) 331 trans = 1; 332 else if (is_alu_vec_unit_inst(bc, alu)) 333 trans = 0; 334 else if (assignment[chan]) 335 trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */ 336 else 337 trans = 0; 338 339 if (trans) { 340 if (assignment[4]) { 341 assert(0); /* ALU.Trans has already been allocated. */ 342 return -1; 343 } 344 assignment[4] = alu; 345 } else { 346 if (assignment[chan]) { 347 assert(0); /* ALU.chan has already been allocated. 
*/ 348 return -1; 349 } 350 assignment[chan] = alu; 351 } 352 353 if (alu->last) 354 break; 355 } 356 return 0; 357 } 358 359 struct alu_bank_swizzle { 360 int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS]; 361 int hw_cfile_addr[4]; 362 int hw_cfile_elem[4]; 363 }; 364 365 static const unsigned cycle_for_bank_swizzle_vec[][3] = { 366 [SQ_ALU_VEC_012] = { 0, 1, 2 }, 367 [SQ_ALU_VEC_021] = { 0, 2, 1 }, 368 [SQ_ALU_VEC_120] = { 1, 2, 0 }, 369 [SQ_ALU_VEC_102] = { 1, 0, 2 }, 370 [SQ_ALU_VEC_201] = { 2, 0, 1 }, 371 [SQ_ALU_VEC_210] = { 2, 1, 0 } 372 }; 373 374 static const unsigned cycle_for_bank_swizzle_scl[][3] = { 375 [SQ_ALU_SCL_210] = { 2, 1, 0 }, 376 [SQ_ALU_SCL_122] = { 1, 2, 2 }, 377 [SQ_ALU_SCL_212] = { 2, 1, 2 }, 378 [SQ_ALU_SCL_221] = { 2, 2, 1 } 379 }; 380 381 static void init_bank_swizzle(struct alu_bank_swizzle *bs) 382 { 383 int i, cycle, component; 384 /* set up gpr use */ 385 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++) 386 for (component = 0; component < NUM_OF_COMPONENTS; component++) 387 bs->hw_gpr[cycle][component] = -1; 388 for (i = 0; i < 4; i++) 389 bs->hw_cfile_addr[i] = -1; 390 for (i = 0; i < 4; i++) 391 bs->hw_cfile_elem[i] = -1; 392 } 393 394 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle) 395 { 396 if (bs->hw_gpr[cycle][chan] == -1) 397 bs->hw_gpr[cycle][chan] = sel; 398 else if (bs->hw_gpr[cycle][chan] != (int)sel) { 399 /* Another scalar operation has already used the GPR read port for the channel. 
*/ 400 return -1; 401 } 402 return 0; 403 } 404 405 static int reserve_cfile(const struct r600_bytecode *bc, 406 struct alu_bank_swizzle *bs, unsigned sel, unsigned chan) 407 { 408 int res, num_res = 4; 409 if (bc->chip_class >= R700) { 410 num_res = 2; 411 chan /= 2; 412 } 413 for (res = 0; res < num_res; ++res) { 414 if (bs->hw_cfile_addr[res] == -1) { 415 bs->hw_cfile_addr[res] = sel; 416 bs->hw_cfile_elem[res] = chan; 417 return 0; 418 } else if (bs->hw_cfile_addr[res] == sel && 419 bs->hw_cfile_elem[res] == chan) 420 return 0; /* Read for this scalar element already reserved, nothing to do here. */ 421 } 422 /* All cfile read ports are used, cannot reference vector element. */ 423 return -1; 424 } 425 426 static int is_gpr(unsigned sel) 427 { 428 return (sel <= 127); 429 } 430 431 /* CB constants start at 512, and get translated to a kcache index when ALU 432 * clauses are constructed. Note that we handle kcache constants the same way 433 * as (the now gone) cfile constants, is that really required? */ 434 static int is_cfile(unsigned sel) 435 { 436 return (sel > 255 && sel < 512) || 437 (sel > 511 && sel < 4607) || /* Kcache before translation. */ 438 (sel > 127 && sel < 192); /* Kcache after translation. */ 439 } 440 441 static int is_const(int sel) 442 { 443 return is_cfile(sel) || 444 (sel >= V_SQ_ALU_SRC_0 && 445 sel <= V_SQ_ALU_SRC_LITERAL); 446 } 447 448 static int check_vector(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, 449 struct alu_bank_swizzle *bs, int bank_swizzle) 450 { 451 int r, src, num_src, sel, elem, cycle; 452 453 num_src = r600_bytecode_get_num_operands(alu); 454 for (src = 0; src < num_src; src++) { 455 sel = alu->src[src].sel; 456 elem = alu->src[src].chan; 457 if (is_gpr(sel)) { 458 cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src]; 459 if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan) 460 /* Nothing to do; special-case optimization, 461 * second source uses first sources reservation. 
*/ 462 continue; 463 else { 464 r = reserve_gpr(bs, sel, elem, cycle); 465 if (r) 466 return r; 467 } 468 } else if (is_cfile(sel)) { 469 r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem); 470 if (r) 471 return r; 472 } 473 /* No restrictions on PV, PS, literal or special constants. */ 474 } 475 return 0; 476 } 477 478 static int check_scalar(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, 479 struct alu_bank_swizzle *bs, int bank_swizzle) 480 { 481 int r, src, num_src, const_count, sel, elem, cycle; 482 483 num_src = r600_bytecode_get_num_operands(alu); 484 for (const_count = 0, src = 0; src < num_src; ++src) { 485 sel = alu->src[src].sel; 486 elem = alu->src[src].chan; 487 if (is_const(sel)) { /* Any constant, including literal and inline constants. */ 488 if (const_count >= 2) 489 /* More than two references to a constant in 490 * transcendental operation. */ 491 return -1; 492 else 493 const_count++; 494 } 495 if (is_cfile(sel)) { 496 r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem); 497 if (r) 498 return r; 499 } 500 } 501 for (src = 0; src < num_src; ++src) { 502 sel = alu->src[src].sel; 503 elem = alu->src[src].chan; 504 if (is_gpr(sel)) { 505 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src]; 506 if (cycle < const_count) 507 /* Cycle for GPR load conflicts with 508 * constant load in transcendental operation. */ 509 return -1; 510 r = reserve_gpr(bs, sel, elem, cycle); 511 if (r) 512 return r; 513 } 514 /* PV PS restrictions */ 515 if (const_count && (sel == 254 || sel == 255)) { 516 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src]; 517 if (cycle < const_count) 518 return -1; 519 } 520 } 521 return 0; 522 } 523 524 static int check_and_set_bank_swizzle(const struct r600_bytecode *bc, 525 struct r600_bytecode_alu *slots[5]) 526 { 527 struct alu_bank_swizzle bs; 528 int bank_swizzle[5]; 529 int i, r = 0, forced = 1; 530 boolean scalar_only = bc->chip_class == CAYMAN ? 
false : true; 531 int max_slots = bc->chip_class == CAYMAN ? 4 : 5; 532 533 for (i = 0; i < max_slots; i++) { 534 if (slots[i]) { 535 if (slots[i]->bank_swizzle_force) { 536 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force; 537 } else { 538 forced = 0; 539 } 540 } 541 542 if (i < 4 && slots[i]) 543 scalar_only = false; 544 } 545 if (forced) 546 return 0; 547 548 /* Just check every possible combination of bank swizzle. 549 * Not very efficent, but works on the first try in most of the cases. */ 550 for (i = 0; i < 4; i++) 551 if (!slots[i] || !slots[i]->bank_swizzle_force) 552 bank_swizzle[i] = SQ_ALU_VEC_012; 553 else 554 bank_swizzle[i] = slots[i]->bank_swizzle; 555 556 bank_swizzle[4] = SQ_ALU_SCL_210; 557 while(bank_swizzle[4] <= SQ_ALU_SCL_221) { 558 559 init_bank_swizzle(&bs); 560 if (scalar_only == false) { 561 for (i = 0; i < 4; i++) { 562 if (slots[i]) { 563 r = check_vector(bc, slots[i], &bs, bank_swizzle[i]); 564 if (r) 565 break; 566 } 567 } 568 } else 569 r = 0; 570 571 if (!r && max_slots == 5 && slots[4]) { 572 r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]); 573 } 574 if (!r) { 575 for (i = 0; i < max_slots; i++) { 576 if (slots[i]) 577 slots[i]->bank_swizzle = bank_swizzle[i]; 578 } 579 return 0; 580 } 581 582 if (scalar_only) { 583 bank_swizzle[4]++; 584 } else { 585 for (i = 0; i < max_slots; i++) { 586 if (!slots[i] || !slots[i]->bank_swizzle_force) { 587 bank_swizzle[i]++; 588 if (bank_swizzle[i] <= SQ_ALU_VEC_210) 589 break; 590 else if (i < max_slots - 1) 591 bank_swizzle[i] = SQ_ALU_VEC_012; 592 else 593 return -1; 594 } 595 } 596 } 597 } 598 599 /* Couldn't find a working swizzle. */ 600 return -1; 601 } 602 603 static int replace_gpr_with_pv_ps(struct r600_bytecode *bc, 604 struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev) 605 { 606 struct r600_bytecode_alu *prev[5]; 607 int gpr[5], chan[5]; 608 int i, j, r, src, num_src; 609 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 610 611 r = assign_alu_units(bc, alu_prev, prev); 612 if (r) 613 return r; 614 615 for (i = 0; i < max_slots; ++i) { 616 if (prev[i] && alu_writes(prev[i]) && !prev[i]->dst.rel) { 617 618 if (is_alu_64bit_inst(prev[i])) { 619 gpr[i] = -1; 620 continue; 621 } 622 623 gpr[i] = prev[i]->dst.sel; 624 /* cube writes more than PV.X */ 625 if (is_alu_reduction_inst(bc, prev[i])) 626 chan[i] = 0; 627 else 628 chan[i] = prev[i]->dst.chan; 629 } else 630 gpr[i] = -1; 631 } 632 633 for (i = 0; i < max_slots; ++i) { 634 struct r600_bytecode_alu *alu = slots[i]; 635 if (!alu) 636 continue; 637 638 if (is_alu_64bit_inst(alu)) 639 continue; 640 num_src = r600_bytecode_get_num_operands(alu); 641 for (src = 0; src < num_src; ++src) { 642 if (!is_gpr(alu->src[src].sel) || alu->src[src].rel) 643 continue; 644 645 if (bc->chip_class < CAYMAN) { 646 if (alu->src[src].sel == gpr[4] && 647 alu->src[src].chan == chan[4] && 648 alu_prev->pred_sel == alu->pred_sel) { 649 alu->src[src].sel = V_SQ_ALU_SRC_PS; 650 alu->src[src].chan = 0; 651 continue; 652 } 653 } 654 655 for (j = 0; j < 4; ++j) { 656 if (alu->src[src].sel == gpr[j] && 657 alu->src[src].chan == j && 658 alu_prev->pred_sel == alu->pred_sel) { 659 alu->src[src].sel = V_SQ_ALU_SRC_PV; 660 alu->src[src].chan = chan[j]; 661 break; 662 } 663 } 664 } 665 } 666 667 return 0; 668 } 669 670 void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg, unsigned abs) 671 { 672 switch(value) { 673 case 0: 674 *sel = V_SQ_ALU_SRC_0; 675 break; 676 case 1: 677 *sel = V_SQ_ALU_SRC_1_INT; 678 break; 679 case -1: 680 *sel = V_SQ_ALU_SRC_M_1_INT; 681 break; 682 case 0x3F800000: /* 1.0f */ 683 *sel = V_SQ_ALU_SRC_1; 684 break; 685 case 0x3F000000: /* 0.5f */ 686 *sel = V_SQ_ALU_SRC_0_5; 687 break; 688 case 0xBF800000: /* -1.0f */ 689 *sel = V_SQ_ALU_SRC_1; 690 *neg ^= !abs; 691 break; 692 case 0xBF000000: /* -0.5f */ 693 *sel = V_SQ_ALU_SRC_0_5; 694 *neg ^= !abs; 695 break; 696 default: 697 *sel = V_SQ_ALU_SRC_LITERAL; 
698 break; 699 } 700 } 701 702 /* compute how many literal are needed */ 703 static int r600_bytecode_alu_nliterals(struct r600_bytecode_alu *alu, 704 uint32_t literal[4], unsigned *nliteral) 705 { 706 unsigned num_src = r600_bytecode_get_num_operands(alu); 707 unsigned i, j; 708 709 for (i = 0; i < num_src; ++i) { 710 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 711 uint32_t value = alu->src[i].value; 712 unsigned found = 0; 713 for (j = 0; j < *nliteral; ++j) { 714 if (literal[j] == value) { 715 found = 1; 716 break; 717 } 718 } 719 if (!found) { 720 if (*nliteral >= 4) 721 return -EINVAL; 722 literal[(*nliteral)++] = value; 723 } 724 } 725 } 726 return 0; 727 } 728 729 static void r600_bytecode_alu_adjust_literals(struct r600_bytecode_alu *alu, 730 uint32_t literal[4], unsigned nliteral) 731 { 732 unsigned num_src = r600_bytecode_get_num_operands(alu); 733 unsigned i, j; 734 735 for (i = 0; i < num_src; ++i) { 736 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 737 uint32_t value = alu->src[i].value; 738 for (j = 0; j < nliteral; ++j) { 739 if (literal[j] == value) { 740 alu->src[i].chan = j; 741 break; 742 } 743 } 744 } 745 } 746 } 747 748 static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5], 749 struct r600_bytecode_alu *alu_prev) 750 { 751 struct r600_bytecode_alu *prev[5]; 752 struct r600_bytecode_alu *result[5] = { NULL }; 753 754 uint32_t literal[4], prev_literal[4]; 755 unsigned nliteral = 0, prev_nliteral = 0; 756 757 int i, j, r, src, num_src; 758 int num_once_inst = 0; 759 int have_mova = 0, have_rel = 0; 760 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 761 762 r = assign_alu_units(bc, alu_prev, prev); 763 if (r) 764 return r; 765 766 for (i = 0; i < max_slots; ++i) { 767 if (prev[i]) { 768 if (prev[i]->pred_sel) 769 return 0; 770 if (is_alu_once_inst(prev[i])) 771 return 0; 772 } 773 if (slots[i]) { 774 if (slots[i]->pred_sel) 775 return 0; 776 if (is_alu_once_inst(slots[i])) 777 return 0; 778 } 779 } 780 781 for (i = 0; i < max_slots; ++i) { 782 struct r600_bytecode_alu *alu; 783 784 if (num_once_inst > 0) 785 return 0; 786 787 /* check number of literals */ 788 if (prev[i]) { 789 if (r600_bytecode_alu_nliterals(prev[i], literal, &nliteral)) 790 return 0; 791 if (r600_bytecode_alu_nliterals(prev[i], prev_literal, &prev_nliteral)) 792 return 0; 793 if (is_alu_mova_inst(prev[i])) { 794 if (have_rel) 795 return 0; 796 have_mova = 1; 797 } 798 799 if (alu_uses_rel(prev[i])) { 800 if (have_mova) { 801 return 0; 802 } 803 have_rel = 1; 804 } 805 if (alu_uses_lds(prev[i])) 806 return 0; 807 808 num_once_inst += is_alu_once_inst(prev[i]); 809 } 810 if (slots[i] && r600_bytecode_alu_nliterals(slots[i], literal, &nliteral)) 811 return 0; 812 813 /* Let's check used slots. */ 814 if (prev[i] && !slots[i]) { 815 result[i] = prev[i]; 816 continue; 817 } else if (prev[i] && slots[i]) { 818 if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) { 819 /* Trans unit is still free try to use it. 
*/ 820 if (is_alu_any_unit_inst(bc, slots[i]) && !alu_uses_lds(slots[i])) { 821 result[i] = prev[i]; 822 result[4] = slots[i]; 823 } else if (is_alu_any_unit_inst(bc, prev[i])) { 824 if (slots[i]->dst.sel == prev[i]->dst.sel && 825 alu_writes(slots[i]) && 826 alu_writes(prev[i])) 827 return 0; 828 829 result[i] = slots[i]; 830 result[4] = prev[i]; 831 } else 832 return 0; 833 } else 834 return 0; 835 } else if(!slots[i]) { 836 continue; 837 } else { 838 if (max_slots == 5 && slots[i] && prev[4] && 839 slots[i]->dst.sel == prev[4]->dst.sel && 840 slots[i]->dst.chan == prev[4]->dst.chan && 841 alu_writes(slots[i]) && 842 alu_writes(prev[4])) 843 return 0; 844 845 result[i] = slots[i]; 846 } 847 848 alu = slots[i]; 849 num_once_inst += is_alu_once_inst(alu); 850 851 /* don't reschedule NOPs */ 852 if (is_nop_inst(alu)) 853 return 0; 854 855 if (is_alu_mova_inst(alu)) { 856 if (have_rel) { 857 return 0; 858 } 859 have_mova = 1; 860 } 861 862 if (alu_uses_rel(alu)) { 863 if (have_mova) { 864 return 0; 865 } 866 have_rel = 1; 867 } 868 869 if (alu->op == ALU_OP0_SET_CF_IDX0 || 870 alu->op == ALU_OP0_SET_CF_IDX1) 871 return 0; /* data hazard with MOVA */ 872 873 /* Let's check source gprs */ 874 num_src = r600_bytecode_get_num_operands(alu); 875 for (src = 0; src < num_src; ++src) { 876 877 /* Constants don't matter. */ 878 if (!is_gpr(alu->src[src].sel)) 879 continue; 880 881 for (j = 0; j < max_slots; ++j) { 882 if (!prev[j] || !alu_writes(prev[j])) 883 continue; 884 885 /* If it's relative then we can't determin which gpr is really used. */ 886 if (prev[j]->dst.chan == alu->src[src].chan && 887 (prev[j]->dst.sel == alu->src[src].sel || 888 prev[j]->dst.rel || alu->src[src].rel)) 889 return 0; 890 } 891 } 892 } 893 894 /* more than one PRED_ or KILL_ ? 
*/ 895 if (num_once_inst > 1) 896 return 0; 897 898 /* check if the result can still be swizzlet */ 899 r = check_and_set_bank_swizzle(bc, result); 900 if (r) 901 return 0; 902 903 /* looks like everything worked out right, apply the changes */ 904 905 /* undo adding previus literals */ 906 bc->cf_last->ndw -= align(prev_nliteral, 2); 907 908 /* sort instructions */ 909 for (i = 0; i < max_slots; ++i) { 910 slots[i] = result[i]; 911 if (result[i]) { 912 LIST_DEL(&result[i]->list); 913 result[i]->last = 0; 914 LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu); 915 } 916 } 917 918 /* determine new last instruction */ 919 LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1; 920 921 /* determine new first instruction */ 922 for (i = 0; i < max_slots; ++i) { 923 if (result[i]) { 924 bc->cf_last->curr_bs_head = result[i]; 925 break; 926 } 927 } 928 929 bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head; 930 bc->cf_last->prev2_bs_head = NULL; 931 932 return 0; 933 } 934 935 /* we'll keep kcache sets sorted by bank & addr */ 936 static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc, 937 struct r600_bytecode_kcache *kcache, 938 unsigned bank, unsigned line, unsigned index_mode) 939 { 940 int i, kcache_banks = bc->chip_class >= EVERGREEN ? 
4 : 2; 941 942 for (i = 0; i < kcache_banks; i++) { 943 if (kcache[i].mode) { 944 int d; 945 946 if (kcache[i].bank < bank) 947 continue; 948 949 if ((kcache[i].bank == bank && kcache[i].addr > line+1) || 950 kcache[i].bank > bank) { 951 /* try to insert new line */ 952 if (kcache[kcache_banks-1].mode) { 953 /* all sets are in use */ 954 return -ENOMEM; 955 } 956 957 memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache)); 958 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; 959 kcache[i].bank = bank; 960 kcache[i].addr = line; 961 kcache[i].index_mode = index_mode; 962 return 0; 963 } 964 965 d = line - kcache[i].addr; 966 967 if (d == -1) { 968 kcache[i].addr--; 969 if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) { 970 /* we are prepending the line to the current set, 971 * discarding the existing second line, 972 * so we'll have to insert line+2 after it */ 973 line += 2; 974 continue; 975 } else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) { 976 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; 977 return 0; 978 } else { 979 /* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */ 980 return -ENOMEM; 981 } 982 } else if (d == 1) { 983 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; 984 return 0; 985 } else if (d == 0) 986 return 0; 987 } else { /* free kcache set - use it */ 988 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; 989 kcache[i].bank = bank; 990 kcache[i].addr = line; 991 kcache[i].index_mode = index_mode; 992 return 0; 993 } 994 } 995 return -ENOMEM; 996 } 997 998 static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc, 999 struct r600_bytecode_kcache *kcache, 1000 struct r600_bytecode_alu *alu) 1001 { 1002 int i, r; 1003 1004 for (i = 0; i < 3; i++) { 1005 unsigned bank, line, sel = alu->src[i].sel, index_mode; 1006 1007 if (sel < 512) 1008 continue; 1009 1010 bank = alu->src[i].kc_bank; 1011 assert(bank < R600_MAX_HW_CONST_BUFFERS); 1012 line = (sel-512)>>4; 1013 index_mode = alu->src[i].kc_rel ? 
1 : 0; // V_SQ_CF_INDEX_0 / V_SQ_CF_INDEX_NONE 1014 1015 if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line, index_mode))) 1016 return r; 1017 } 1018 return 0; 1019 } 1020 1021 static int r600_bytecode_assign_kcache_banks( 1022 struct r600_bytecode_alu *alu, 1023 struct r600_bytecode_kcache * kcache) 1024 { 1025 int i, j; 1026 1027 /* Alter the src operands to refer to the kcache. */ 1028 for (i = 0; i < 3; ++i) { 1029 static const unsigned int base[] = {128, 160, 256, 288}; 1030 unsigned int line, sel = alu->src[i].sel, found = 0; 1031 1032 if (sel < 512) 1033 continue; 1034 1035 sel -= 512; 1036 line = sel>>4; 1037 1038 for (j = 0; j < 4 && !found; ++j) { 1039 switch (kcache[j].mode) { 1040 case V_SQ_CF_KCACHE_NOP: 1041 case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX: 1042 R600_ERR("unexpected kcache line mode\n"); 1043 return -ENOMEM; 1044 default: 1045 if (kcache[j].bank == alu->src[i].kc_bank && 1046 kcache[j].addr <= line && 1047 line < kcache[j].addr + kcache[j].mode) { 1048 alu->src[i].sel = sel - (kcache[j].addr<<4); 1049 alu->src[i].sel += base[j]; 1050 found=1; 1051 } 1052 } 1053 } 1054 } 1055 return 0; 1056 } 1057 1058 static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, 1059 struct r600_bytecode_alu *alu, 1060 unsigned type) 1061 { 1062 struct r600_bytecode_kcache kcache_sets[4]; 1063 struct r600_bytecode_kcache *kcache = kcache_sets; 1064 int r; 1065 1066 memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache)); 1067 1068 if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { 1069 /* can't alloc, need to start new clause */ 1070 if ((r = r600_bytecode_add_cf(bc))) { 1071 return r; 1072 } 1073 bc->cf_last->op = type; 1074 1075 /* retry with the new clause */ 1076 kcache = bc->cf_last->kcache; 1077 if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { 1078 /* can't alloc again- should never happen */ 1079 return r; 1080 } 1081 } else { 1082 /* update kcache sets */ 1083 
memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache)); 1084 } 1085 1086 /* if we actually used more than 2 kcache sets, or have relative indexing - use ALU_EXTENDED on eg+ */ 1087 if (kcache[2].mode != V_SQ_CF_KCACHE_NOP || 1088 kcache[0].index_mode || kcache[1].index_mode || kcache[2].index_mode || kcache[3].index_mode) { 1089 if (bc->chip_class < EVERGREEN) 1090 return -ENOMEM; 1091 bc->cf_last->eg_alu_extended = 1; 1092 } 1093 1094 return 0; 1095 } 1096 1097 static int insert_nop_r6xx(struct r600_bytecode *bc) 1098 { 1099 struct r600_bytecode_alu alu; 1100 int r, i; 1101 1102 for (i = 0; i < 4; i++) { 1103 memset(&alu, 0, sizeof(alu)); 1104 alu.op = ALU_OP0_NOP; 1105 alu.src[0].chan = i; 1106 alu.dst.chan = i; 1107 alu.last = (i == 3); 1108 r = r600_bytecode_add_alu(bc, &alu); 1109 if (r) 1110 return r; 1111 } 1112 return 0; 1113 } 1114 1115 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */ 1116 static int load_ar_r6xx(struct r600_bytecode *bc) 1117 { 1118 struct r600_bytecode_alu alu; 1119 int r; 1120 1121 if (bc->ar_loaded) 1122 return 0; 1123 1124 /* hack to avoid making MOVA the last instruction in the clause */ 1125 if ((bc->cf_last->ndw>>1) >= 110) 1126 bc->force_add_cf = 1; 1127 1128 memset(&alu, 0, sizeof(alu)); 1129 alu.op = ALU_OP1_MOVA_GPR_INT; 1130 alu.src[0].sel = bc->ar_reg; 1131 alu.src[0].chan = bc->ar_chan; 1132 alu.last = 1; 1133 alu.index_mode = INDEX_MODE_LOOP; 1134 r = r600_bytecode_add_alu(bc, &alu); 1135 if (r) 1136 return r; 1137 1138 /* no requirement to set uses waterfall on MOVA_GPR_INT */ 1139 bc->ar_loaded = 1; 1140 return 0; 1141 } 1142 1143 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */ 1144 static int load_ar(struct r600_bytecode *bc) 1145 { 1146 struct r600_bytecode_alu alu; 1147 int r; 1148 1149 if (bc->ar_handling) 1150 return load_ar_r6xx(bc); 1151 1152 if (bc->ar_loaded) 1153 return 0; 1154 1155 /* hack to avoid making MOVA the last instruction in the clause */ 1156 if 
((bc->cf_last->ndw>>1) >= 110) 1157 bc->force_add_cf = 1; 1158 1159 memset(&alu, 0, sizeof(alu)); 1160 alu.op = ALU_OP1_MOVA_INT; 1161 alu.src[0].sel = bc->ar_reg; 1162 alu.src[0].chan = bc->ar_chan; 1163 alu.last = 1; 1164 r = r600_bytecode_add_alu(bc, &alu); 1165 if (r) 1166 return r; 1167 1168 bc->cf_last->r6xx_uses_waterfall = 1; 1169 bc->ar_loaded = 1; 1170 return 0; 1171 } 1172 1173 int r600_bytecode_add_alu_type(struct r600_bytecode *bc, 1174 const struct r600_bytecode_alu *alu, unsigned type) 1175 { 1176 struct r600_bytecode_alu *nalu = r600_bytecode_alu(); 1177 struct r600_bytecode_alu *lalu; 1178 int i, r; 1179 1180 if (!nalu) 1181 return -ENOMEM; 1182 memcpy(nalu, alu, sizeof(struct r600_bytecode_alu)); 1183 1184 if (alu->is_op3) { 1185 /* will fail later since alu does not support it. */ 1186 assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs); 1187 } 1188 1189 if (bc->cf_last != NULL && bc->cf_last->op != type) { 1190 /* check if we could add it anyway */ 1191 if (bc->cf_last->op == CF_OP_ALU && 1192 type == CF_OP_ALU_PUSH_BEFORE) { 1193 LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) { 1194 if (lalu->execute_mask) { 1195 bc->force_add_cf = 1; 1196 break; 1197 } 1198 } 1199 } else 1200 bc->force_add_cf = 1; 1201 } 1202 1203 /* cf can contains only alu or only vtx or only tex */ 1204 if (bc->cf_last == NULL || bc->force_add_cf) { 1205 r = r600_bytecode_add_cf(bc); 1206 if (r) { 1207 free(nalu); 1208 return r; 1209 } 1210 } 1211 bc->cf_last->op = type; 1212 1213 /* Load index register if required */ 1214 if (bc->chip_class >= EVERGREEN) { 1215 for (i = 0; i < 3; i++) 1216 if (nalu->src[i].kc_bank && nalu->src[i].kc_rel) 1217 egcm_load_index_reg(bc, 0, true); 1218 } 1219 1220 /* Check AR usage and load it if required */ 1221 for (i = 0; i < 3; i++) 1222 if (nalu->src[i].rel && !bc->ar_loaded) 1223 load_ar(bc); 1224 1225 if (nalu->dst.rel && !bc->ar_loaded) 1226 load_ar(bc); 1227 1228 /* Setup the kcache for this ALU instruction. 
This will start a new 1229 * ALU clause if needed. */ 1230 if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) { 1231 free(nalu); 1232 return r; 1233 } 1234 1235 if (!bc->cf_last->curr_bs_head) { 1236 bc->cf_last->curr_bs_head = nalu; 1237 } 1238 /* number of gpr == the last gpr used in any alu */ 1239 for (i = 0; i < 3; i++) { 1240 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) { 1241 bc->ngpr = nalu->src[i].sel + 1; 1242 } 1243 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL) 1244 r600_bytecode_special_constants(nalu->src[i].value, 1245 &nalu->src[i].sel, &nalu->src[i].neg, nalu->src[i].abs); 1246 } 1247 if (nalu->dst.sel >= bc->ngpr) { 1248 bc->ngpr = nalu->dst.sel + 1; 1249 } 1250 LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu); 1251 /* each alu use 2 dwords */ 1252 bc->cf_last->ndw += 2; 1253 bc->ndw += 2; 1254 1255 /* process cur ALU instructions for bank swizzle */ 1256 if (nalu->last) { 1257 uint32_t literal[4]; 1258 unsigned nliteral; 1259 struct r600_bytecode_alu *slots[5]; 1260 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 1261 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots); 1262 if (r) 1263 return r; 1264 1265 if (bc->cf_last->prev_bs_head) { 1266 r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head); 1267 if (r) 1268 return r; 1269 } 1270 1271 if (bc->cf_last->prev_bs_head) { 1272 r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head); 1273 if (r) 1274 return r; 1275 } 1276 1277 r = check_and_set_bank_swizzle(bc, slots); 1278 if (r) 1279 return r; 1280 1281 for (i = 0, nliteral = 0; i < max_slots; i++) { 1282 if (slots[i]) { 1283 r = r600_bytecode_alu_nliterals(slots[i], literal, &nliteral); 1284 if (r) 1285 return r; 1286 } 1287 } 1288 bc->cf_last->ndw += align(nliteral, 2); 1289 1290 /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots) 1291 * worst case */ 1292 if ((bc->cf_last->ndw >> 1) >= 120) { 1293 bc->force_add_cf = 1; 1294 } 1295 1296 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head; 1297 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head; 1298 bc->cf_last->curr_bs_head = NULL; 1299 } 1300 1301 if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst) 1302 insert_nop_r6xx(bc); 1303 1304 return 0; 1305 } 1306 1307 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu) 1308 { 1309 return r600_bytecode_add_alu_type(bc, alu, CF_OP_ALU); 1310 } 1311 1312 static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc) 1313 { 1314 switch (bc->chip_class) { 1315 case R600: 1316 return 8; 1317 1318 case R700: 1319 case EVERGREEN: 1320 case CAYMAN: 1321 return 16; 1322 1323 default: 1324 R600_ERR("Unknown chip class %d.\n", bc->chip_class); 1325 return 8; 1326 } 1327 } 1328 1329 static inline boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc) 1330 { 1331 return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) && 1332 bc->cf_last->op != CF_OP_GDS && 1333 (bc->chip_class == CAYMAN || 1334 bc->cf_last->op != CF_OP_TEX)); 1335 } 1336 1337 static int 
r600_bytecode_add_vtx_internal(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx, 1338 bool use_tc) 1339 { 1340 struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx(); 1341 int r; 1342 1343 if (!nvtx) 1344 return -ENOMEM; 1345 memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx)); 1346 1347 /* Load index register if required */ 1348 if (bc->chip_class >= EVERGREEN) { 1349 if (vtx->buffer_index_mode) 1350 egcm_load_index_reg(bc, vtx->buffer_index_mode - 1, false); 1351 } 1352 1353 /* cf can contains only alu or only vtx or only tex */ 1354 if (bc->cf_last == NULL || 1355 last_inst_was_not_vtx_fetch(bc) || 1356 bc->force_add_cf) { 1357 r = r600_bytecode_add_cf(bc); 1358 if (r) { 1359 free(nvtx); 1360 return r; 1361 } 1362 switch (bc->chip_class) { 1363 case R600: 1364 case R700: 1365 bc->cf_last->op = CF_OP_VTX; 1366 break; 1367 case EVERGREEN: 1368 if (use_tc) 1369 bc->cf_last->op = CF_OP_TEX; 1370 else 1371 bc->cf_last->op = CF_OP_VTX; 1372 break; 1373 case CAYMAN: 1374 bc->cf_last->op = CF_OP_TEX; 1375 break; 1376 default: 1377 R600_ERR("Unknown chip class %d.\n", bc->chip_class); 1378 free(nvtx); 1379 return -EINVAL; 1380 } 1381 } 1382 LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx); 1383 /* each fetch use 4 dwords */ 1384 bc->cf_last->ndw += 4; 1385 bc->ndw += 4; 1386 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1387 bc->force_add_cf = 1; 1388 1389 bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1); 1390 bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1); 1391 1392 return 0; 1393 } 1394 1395 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx) 1396 { 1397 return r600_bytecode_add_vtx_internal(bc, vtx, false); 1398 } 1399 1400 int r600_bytecode_add_vtx_tc(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx) 1401 { 1402 return r600_bytecode_add_vtx_internal(bc, vtx, true); 1403 } 1404 1405 int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex) 1406 { 1407 struct 
r600_bytecode_tex *ntex = r600_bytecode_tex(); 1408 int r; 1409 1410 if (!ntex) 1411 return -ENOMEM; 1412 memcpy(ntex, tex, sizeof(struct r600_bytecode_tex)); 1413 1414 /* Load index register if required */ 1415 if (bc->chip_class >= EVERGREEN) { 1416 if (tex->sampler_index_mode || tex->resource_index_mode) 1417 egcm_load_index_reg(bc, 1, false); 1418 } 1419 1420 /* we can't fetch data und use it as texture lookup address in the same TEX clause */ 1421 if (bc->cf_last != NULL && 1422 bc->cf_last->op == CF_OP_TEX) { 1423 struct r600_bytecode_tex *ttex; 1424 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) { 1425 if (ttex->dst_gpr == ntex->src_gpr) { 1426 bc->force_add_cf = 1; 1427 break; 1428 } 1429 } 1430 /* slight hack to make gradients always go into same cf */ 1431 if (ntex->op == FETCH_OP_SET_GRADIENTS_H) 1432 bc->force_add_cf = 1; 1433 } 1434 1435 /* cf can contains only alu or only vtx or only tex */ 1436 if (bc->cf_last == NULL || 1437 bc->cf_last->op != CF_OP_TEX || 1438 bc->force_add_cf) { 1439 r = r600_bytecode_add_cf(bc); 1440 if (r) { 1441 free(ntex); 1442 return r; 1443 } 1444 bc->cf_last->op = CF_OP_TEX; 1445 } 1446 if (ntex->src_gpr >= bc->ngpr) { 1447 bc->ngpr = ntex->src_gpr + 1; 1448 } 1449 if (ntex->dst_gpr >= bc->ngpr) { 1450 bc->ngpr = ntex->dst_gpr + 1; 1451 } 1452 LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex); 1453 /* each texture fetch use 4 dwords */ 1454 bc->cf_last->ndw += 4; 1455 bc->ndw += 4; 1456 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1457 bc->force_add_cf = 1; 1458 return 0; 1459 } 1460 1461 int r600_bytecode_add_gds(struct r600_bytecode *bc, const struct r600_bytecode_gds *gds) 1462 { 1463 struct r600_bytecode_gds *ngds = r600_bytecode_gds(); 1464 int r; 1465 1466 if (ngds == NULL) 1467 return -ENOMEM; 1468 memcpy(ngds, gds, sizeof(struct r600_bytecode_gds)); 1469 1470 if (bc->chip_class >= EVERGREEN) { 1471 if (gds->uav_index_mode) 1472 egcm_load_index_reg(bc, gds->uav_index_mode - 1, false); 
1473 } 1474 1475 if (bc->cf_last == NULL || 1476 bc->cf_last->op != CF_OP_GDS || 1477 bc->force_add_cf) { 1478 r = r600_bytecode_add_cf(bc); 1479 if (r) { 1480 free(ngds); 1481 return r; 1482 } 1483 bc->cf_last->op = CF_OP_GDS; 1484 } 1485 1486 LIST_ADDTAIL(&ngds->list, &bc->cf_last->gds); 1487 bc->cf_last->ndw += 4; /* each GDS uses 4 dwords */ 1488 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1489 bc->force_add_cf = 1; 1490 return 0; 1491 } 1492 1493 int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op) 1494 { 1495 int r; 1496 r = r600_bytecode_add_cf(bc); 1497 if (r) 1498 return r; 1499 1500 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE; 1501 bc->cf_last->op = op; 1502 return 0; 1503 } 1504 1505 int cm_bytecode_add_cf_end(struct r600_bytecode *bc) 1506 { 1507 return r600_bytecode_add_cfinst(bc, CF_OP_CF_END); 1508 } 1509 1510 /* common to all 3 families */ 1511 static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id) 1512 { 1513 bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(vtx->op) | 1514 S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) | 1515 S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) | 1516 S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) | 1517 S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x); 1518 if (bc->chip_class < CAYMAN) 1519 bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count); 1520 id++; 1521 bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) | 1522 S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) | 1523 S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) | 1524 S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) | 1525 S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) | 1526 S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) | 1527 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) | 1528 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) | 1529 S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) | 1530 S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr); 1531 bc->bytecode[id] = 
S_SQ_VTX_WORD2_OFFSET(vtx->offset)| 1532 S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian); 1533 if (bc->chip_class >= EVERGREEN) 1534 bc->bytecode[id] |= ((vtx->buffer_index_mode & 0x3) << 21); // S_SQ_VTX_WORD2_BIM(vtx->buffer_index_mode); 1535 if (bc->chip_class < CAYMAN) 1536 bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1); 1537 id++; 1538 bc->bytecode[id++] = 0; 1539 return 0; 1540 } 1541 1542 /* common to all 3 families */ 1543 static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id) 1544 { 1545 bc->bytecode[id] = S_SQ_TEX_WORD0_TEX_INST( 1546 r600_isa_fetch_opcode(bc->isa->hw_class, tex->op)) | 1547 EG_S_SQ_TEX_WORD0_INST_MOD(tex->inst_mod) | 1548 S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) | 1549 S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) | 1550 S_SQ_TEX_WORD0_SRC_REL(tex->src_rel); 1551 if (bc->chip_class >= EVERGREEN) 1552 bc->bytecode[id] |= ((tex->sampler_index_mode & 0x3) << 27) | // S_SQ_TEX_WORD0_SIM(tex->sampler_index_mode); 1553 ((tex->resource_index_mode & 0x3) << 25); // S_SQ_TEX_WORD0_RIM(tex->resource_index_mode) 1554 id++; 1555 bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) | 1556 S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) | 1557 S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) | 1558 S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) | 1559 S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) | 1560 S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) | 1561 S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) | 1562 S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) | 1563 S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) | 1564 S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) | 1565 S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w); 1566 bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) | 1567 S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) | 1568 S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) | 1569 S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) | 1570 S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) | 1571 S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) | 1572 
S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) | 1573 S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w); 1574 bc->bytecode[id++] = 0; 1575 return 0; 1576 } 1577 1578 /* r600 only, r700/eg bits in r700_asm.c */ 1579 static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id) 1580 { 1581 unsigned opcode = r600_isa_alu_opcode(bc->isa->hw_class, alu->op); 1582 1583 /* don't replace gpr by pv or ps for destination register */ 1584 bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) | 1585 S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) | 1586 S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) | 1587 S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) | 1588 S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) | 1589 S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) | 1590 S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) | 1591 S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) | 1592 S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) | 1593 S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) | 1594 S_SQ_ALU_WORD0_LAST(alu->last); 1595 1596 if (alu->is_op3) { 1597 assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs); 1598 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) | 1599 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) | 1600 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) | 1601 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) | 1602 S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) | 1603 S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) | 1604 S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) | 1605 S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) | 1606 S_SQ_ALU_WORD1_OP3_ALU_INST(opcode) | 1607 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle); 1608 } else { 1609 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) | 1610 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) | 1611 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) | 1612 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) | 1613 S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) | 1614 S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) | 1615 S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) | 1616 
S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) | 1617 S_SQ_ALU_WORD1_OP2_ALU_INST(opcode) | 1618 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) | 1619 S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) | 1620 S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred); 1621 } 1622 return 0; 1623 } 1624 1625 static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf) 1626 { 1627 *bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1); 1628 *bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) | 1629 S_SQ_CF_WORD1_BARRIER(1) | 1630 S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1)| 1631 S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program); 1632 } 1633 1634 /* common for r600/r700 - eg in eg_asm.c */ 1635 static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf) 1636 { 1637 unsigned id = cf->id; 1638 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 1639 unsigned opcode = r600_isa_cf_opcode(bc->isa->hw_class, cf->op); 1640 1641 1642 if (cf->op == CF_NATIVE) { 1643 bc->bytecode[id++] = cf->isa[0]; 1644 bc->bytecode[id++] = cf->isa[1]; 1645 } else if (cfop->flags & CF_ALU) { 1646 bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) | 1647 S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) | 1648 S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) | 1649 S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank); 1650 1651 bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(opcode) | 1652 S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) | 1653 S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) | 1654 S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) | 1655 S_SQ_CF_ALU_WORD1_BARRIER(1) | 1656 S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? 
cf->r6xx_uses_waterfall : 0) | 1657 S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1); 1658 } else if (cfop->flags & CF_FETCH) { 1659 if (bc->chip_class == R700) 1660 r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf); 1661 else 1662 r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf); 1663 } else if (cfop->flags & CF_EXP) { 1664 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | 1665 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | 1666 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | 1667 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) | 1668 S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr); 1669 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | 1670 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) | 1671 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) | 1672 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) | 1673 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) | 1674 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) | 1675 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) | 1676 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program); 1677 } else if (cfop->flags & CF_MEM) { 1678 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | 1679 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | 1680 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | 1681 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) | 1682 S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr); 1683 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | 1684 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) | 1685 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) | 1686 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program) | 1687 S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) | 1688 S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask); 1689 } else { 1690 
/*
 * Assemble the final dword stream for a shader into bc->bytecode.
 *
 * Pass 1 walks the CF list and assigns each CF the address of its clause
 * body; pass 2 encodes every CF instruction and then the ALU / VTX / GDS /
 * TEX instructions it contains.  Any previously built bc->bytecode is freed
 * and replaced by a zero-initialised buffer of bc->ndw dwords.
 *
 * Returns 0 on success, -ENOMEM when the buffer cannot be allocated,
 * -EINVAL for an unknown chip class, or the error of a failing encoder.
 *
 * NOTE(review): bc->cf_last is dereferenced unconditionally, so this appears
 * to expect at least one CF to have been added - confirm against callers.
 */
int r600_bytecode_build(struct r600_bytecode *bc)
{
	struct r600_bytecode_cf *cf;
	struct r600_bytecode_alu *alu;
	struct r600_bytecode_vtx *vtx;
	struct r600_bytecode_tex *tex;
	struct r600_bytecode_gds *gds;
	uint32_t literal[4];
	unsigned nliteral;
	unsigned addr;
	int i, r;

	if (!bc->nstack) { // If not 0, Stack_size already provided by llvm
		if (bc->stack.max_entries)
			bc->nstack = bc->stack.max_entries;
		else if (bc->type == PIPE_SHADER_VERTEX ||
			 bc->type == PIPE_SHADER_TESS_EVAL ||
			 bc->type == PIPE_SHADER_TESS_CTRL)
			bc->nstack = 1;
	}

	/* first pass: compute the address of each CF block */
	/* addresses start after all the CF instructions (2 dwords each) */
	addr = bc->cf_last->id + 2;
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		if (r600_isa_cf(cf->op)->flags & CF_FETCH) {
			/* round the address up to a multiple of 4 dwords */
			addr += 3;
			addr &= 0xFFFFFFFCUL;
		}
		cf->addr = addr;
		addr += cf->ndw;
		/* after the loop this holds the end of the last clause */
		bc->ndw = cf->addr + cf->ndw;
	}
	free(bc->bytecode);
	/* bc->ndw elements of 4 bytes each, zero-filled */
	bc->bytecode = calloc(4, bc->ndw);
	if (bc->bytecode == NULL)
		return -ENOMEM;
	/* second pass: encode each CF and the instructions of its clause */
	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
		const struct cf_op_info *cfop = r600_isa_cf(cf->op);
		addr = cf->addr;
		if (bc->chip_class >= EVERGREEN)
			r = eg_bytecode_cf_build(bc, cf);
		else
			r = r600_bytecode_cf_build(bc, cf);
		if (r)
			return r;
		if (cfop->flags & CF_ALU) {
			/* literals are gathered per instruction group */
			nliteral = 0;
			memset(literal, 0, sizeof(literal));
			LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
				r = r600_bytecode_alu_nliterals(alu, literal, &nliteral);
				if (r)
					return r;
				r600_bytecode_alu_adjust_literals(alu, literal, nliteral);
				r600_bytecode_assign_kcache_banks(alu, cf->kcache);

				/* pick the ALU encoder matching the chip class */
				switch(bc->chip_class) {
				case R600:
					r = r600_bytecode_alu_build(bc, alu, addr);
					break;
				case R700:
					r = r700_bytecode_alu_build(bc, alu, addr);
					break;
				case EVERGREEN:
				case CAYMAN:
					r = eg_bytecode_alu_build(bc, alu, addr);
					break;
				default:
					R600_ERR("unknown chip class %d.\n", bc->chip_class);
					return -EINVAL;
				}
				if (r)
					return r;
				/* each ALU instruction occupies 2 dwords */
				addr += 2;
				if (alu->last) {
					/* end of group: emit its literals, padded
					 * to an even number of dwords */
					for (i = 0; i < align(nliteral, 2); ++i) {
						bc->bytecode[addr++] = literal[i];
					}
					nliteral = 0;
					memset(literal, 0, sizeof(literal));
				}
			}
		} else if (cf->op == CF_OP_VTX) {
			/* each vertex fetch occupies 4 dwords */
			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
				r = r600_bytecode_vtx_build(bc, vtx, addr);
				if (r)
					return r;
				addr += 4;
			}
		} else if (cf->op == CF_OP_GDS) {
			assert(bc->chip_class >= EVERGREEN);
			LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) {
				r = eg_bytecode_gds_build(bc, gds, addr);
				if (r)
					return r;
				addr += 4;
			}
		} else if (cf->op == CF_OP_TEX) {
			/* a TEX clause may carry vertex fetches (asserted to be
			 * EVERGREEN or later); they are emitted before the
			 * texture fetches */
			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
				assert(bc->chip_class >= EVERGREEN);
				r = r600_bytecode_vtx_build(bc, vtx, addr);
				if (r)
					return r;
				addr += 4;
			}
			LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
				r = r600_bytecode_tex_build(bc, tex, addr);
				if (r)
					return r;
				addr += 4;
			}
		}
	}
	return 0;
}
= NULL, *next_gds; 1828 1829 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) { 1830 free(alu); 1831 } 1832 1833 LIST_INITHEAD(&cf->alu); 1834 1835 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) { 1836 free(tex); 1837 } 1838 1839 LIST_INITHEAD(&cf->tex); 1840 1841 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) { 1842 free(vtx); 1843 } 1844 1845 LIST_INITHEAD(&cf->vtx); 1846 1847 LIST_FOR_EACH_ENTRY_SAFE(gds, next_gds, &cf->gds, list) { 1848 free(gds); 1849 } 1850 1851 LIST_INITHEAD(&cf->gds); 1852 1853 free(cf); 1854 } 1855 1856 LIST_INITHEAD(&cf->list); 1857 } 1858 1859 static int print_swizzle(unsigned swz) 1860 { 1861 const char * swzchars = "xyzw01?_"; 1862 assert(swz<8 && swz != 6); 1863 return fprintf(stderr, "%c", swzchars[swz]); 1864 } 1865 1866 static int print_sel(unsigned sel, unsigned rel, unsigned index_mode, 1867 unsigned need_brackets) 1868 { 1869 int o = 0; 1870 if (rel && index_mode >= 5 && sel < 128) 1871 o += fprintf(stderr, "G"); 1872 if (rel || need_brackets) { 1873 o += fprintf(stderr, "["); 1874 } 1875 o += fprintf(stderr, "%d", sel); 1876 if (rel) { 1877 if (index_mode == 0 || index_mode == 6) 1878 o += fprintf(stderr, "+AR"); 1879 else if (index_mode == 4) 1880 o += fprintf(stderr, "+AL"); 1881 } 1882 if (rel || need_brackets) { 1883 o += fprintf(stderr, "]"); 1884 } 1885 return o; 1886 } 1887 1888 static int print_dst(struct r600_bytecode_alu *alu) 1889 { 1890 int o = 0; 1891 unsigned sel = alu->dst.sel; 1892 char reg_char = 'R'; 1893 if (sel > 128 - 4) { /* clause temporary gpr */ 1894 sel -= 128 - 4; 1895 reg_char = 'T'; 1896 } 1897 1898 if (alu_writes(alu)) { 1899 o += fprintf(stderr, "%c", reg_char); 1900 o += print_sel(alu->dst.sel, alu->dst.rel, alu->index_mode, 0); 1901 } else { 1902 o += fprintf(stderr, "__"); 1903 } 1904 o += fprintf(stderr, "."); 1905 o += print_swizzle(alu->dst.chan); 1906 return o; 1907 } 1908 1909 static int print_src(struct r600_bytecode_alu *alu, unsigned idx) 1910 { 1911 int o = 0; 
1912 struct r600_bytecode_alu_src *src = &alu->src[idx]; 1913 unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0; 1914 1915 if (src->neg) 1916 o += fprintf(stderr,"-"); 1917 if (src->abs) 1918 o += fprintf(stderr,"|"); 1919 1920 if (sel < 128 - 4) { 1921 o += fprintf(stderr, "R"); 1922 } else if (sel < 128) { 1923 o += fprintf(stderr, "T"); 1924 sel -= 128 - 4; 1925 } else if (sel < 160) { 1926 o += fprintf(stderr, "KC0"); 1927 need_brackets = 1; 1928 sel -= 128; 1929 } else if (sel < 192) { 1930 o += fprintf(stderr, "KC1"); 1931 need_brackets = 1; 1932 sel -= 160; 1933 } else if (sel >= 512) { 1934 o += fprintf(stderr, "C%d", src->kc_bank); 1935 need_brackets = 1; 1936 sel -= 512; 1937 } else if (sel >= 448) { 1938 o += fprintf(stderr, "Param"); 1939 sel -= 448; 1940 need_chan = 0; 1941 } else if (sel >= 288) { 1942 o += fprintf(stderr, "KC3"); 1943 need_brackets = 1; 1944 sel -= 288; 1945 } else if (sel >= 256) { 1946 o += fprintf(stderr, "KC2"); 1947 need_brackets = 1; 1948 sel -= 256; 1949 } else { 1950 need_sel = 0; 1951 need_chan = 0; 1952 switch (sel) { 1953 case EG_V_SQ_ALU_SRC_LDS_DIRECT_A: 1954 o += fprintf(stderr, "LDS_A[0x%08X]", src->value); 1955 break; 1956 case EG_V_SQ_ALU_SRC_LDS_DIRECT_B: 1957 o += fprintf(stderr, "LDS_B[0x%08X]", src->value); 1958 break; 1959 case EG_V_SQ_ALU_SRC_LDS_OQ_A: 1960 o += fprintf(stderr, "LDS_OQ_A"); 1961 need_chan = 1; 1962 break; 1963 case EG_V_SQ_ALU_SRC_LDS_OQ_B: 1964 o += fprintf(stderr, "LDS_OQ_B"); 1965 need_chan = 1; 1966 break; 1967 case EG_V_SQ_ALU_SRC_LDS_OQ_A_POP: 1968 o += fprintf(stderr, "LDS_OQ_A_POP"); 1969 need_chan = 1; 1970 break; 1971 case EG_V_SQ_ALU_SRC_LDS_OQ_B_POP: 1972 o += fprintf(stderr, "LDS_OQ_B_POP"); 1973 need_chan = 1; 1974 break; 1975 case EG_V_SQ_ALU_SRC_SE_ID: 1976 o += fprintf(stderr, "SE_ID"); 1977 break; 1978 case EG_V_SQ_ALU_SRC_SIMD_ID: 1979 o += fprintf(stderr, "SIMD_ID"); 1980 break; 1981 case EG_V_SQ_ALU_SRC_HW_WAVE_ID: 1982 o += fprintf(stderr, 
/*
 * Pad the current stderr line with spaces from column p up to column c.
 * Nothing is printed when p >= c.  Returns the sum of the fprintf() results,
 * i.e. the number of spaces written when no output error occurs.
 */
static int print_indent(int p, int c)
{
	int written = 0;
	int col;

	for (col = p; col < c; col++)
		written += fprintf(stderr, " ");

	return written;
}
-------------\n", 2068 bc->ndw, bc->ngpr, bc->nstack); 2069 fprintf(stderr, "shader %d -- %c\n", index++, chip); 2070 2071 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 2072 id = cf->id; 2073 if (cf->op == CF_NATIVE) { 2074 fprintf(stderr, "%04d %08X %08X CF_NATIVE\n", id, bc->bytecode[id], 2075 bc->bytecode[id + 1]); 2076 } else { 2077 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 2078 if (cfop->flags & CF_ALU) { 2079 if (cf->eg_alu_extended) { 2080 fprintf(stderr, "%04d %08X %08X %s\n", id, bc->bytecode[id], 2081 bc->bytecode[id + 1], "ALU_EXT"); 2082 id += 2; 2083 } 2084 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2085 bc->bytecode[id + 1], cfop->name); 2086 fprintf(stderr, "%d @%d ", cf->ndw / 2, cf->addr); 2087 for (i = 0; i < 4; ++i) { 2088 if (cf->kcache[i].mode) { 2089 int c_start = (cf->kcache[i].addr << 4); 2090 int c_end = c_start + (cf->kcache[i].mode << 4); 2091 fprintf(stderr, "KC%d[CB%d:%d-%d%s%s] ", 2092 i, cf->kcache[i].bank, c_start, c_end, 2093 cf->kcache[i].index_mode ? " " : "", 2094 cf->kcache[i].index_mode ? 
index_mode[cf->kcache[i].index_mode] : ""); 2095 } 2096 } 2097 fprintf(stderr, "\n"); 2098 } else if (cfop->flags & CF_FETCH) { 2099 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2100 bc->bytecode[id + 1], cfop->name); 2101 fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr); 2102 fprintf(stderr, "\n"); 2103 if (cf->end_of_program) 2104 fprintf(stderr, "EOP "); 2105 } else if (cfop->flags & CF_EXP) { 2106 int o = 0; 2107 const char *exp_type[] = {"PIXEL", "POS ", "PARAM"}; 2108 o += fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2109 bc->bytecode[id + 1], cfop->name); 2110 o += print_indent(o, 43); 2111 o += fprintf(stderr, "%s ", exp_type[cf->output.type]); 2112 if (cf->output.burst_count > 1) { 2113 o += fprintf(stderr, "%d-%d ", cf->output.array_base, 2114 cf->output.array_base + cf->output.burst_count - 1); 2115 2116 o += print_indent(o, 55); 2117 o += fprintf(stderr, "R%d-%d.", cf->output.gpr, 2118 cf->output.gpr + cf->output.burst_count - 1); 2119 } else { 2120 o += fprintf(stderr, "%d ", cf->output.array_base); 2121 o += print_indent(o, 55); 2122 o += fprintf(stderr, "R%d.", cf->output.gpr); 2123 } 2124 2125 o += print_swizzle(cf->output.swizzle_x); 2126 o += print_swizzle(cf->output.swizzle_y); 2127 o += print_swizzle(cf->output.swizzle_z); 2128 o += print_swizzle(cf->output.swizzle_w); 2129 2130 print_indent(o, 67); 2131 2132 fprintf(stderr, " ES:%X ", cf->output.elem_size); 2133 if (cf->mark) 2134 fprintf(stderr, "MARK "); 2135 if (!cf->barrier) 2136 fprintf(stderr, "NO_BARRIER "); 2137 if (cf->end_of_program) 2138 fprintf(stderr, "EOP "); 2139 fprintf(stderr, "\n"); 2140 } else if (r600_isa_cf(cf->op)->flags & CF_MEM) { 2141 int o = 0; 2142 const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK", 2143 "WRITE_IND_ACK"}; 2144 o += fprintf(stderr, "%04d %08X %08X %s ", id, 2145 bc->bytecode[id], bc->bytecode[id + 1], cfop->name); 2146 o += print_indent(o, 43); 2147 o += fprintf(stderr, "%s ", exp_type[cf->output.type]); 2148 
2149 if (r600_isa_cf(cf->op)->flags & CF_RAT) { 2150 o += fprintf(stderr, "RAT%d", cf->rat.id); 2151 if (cf->rat.index_mode) { 2152 o += fprintf(stderr, "[IDX%d]", cf->rat.index_mode - 1); 2153 } 2154 o += fprintf(stderr, " INST: %d ", cf->rat.inst); 2155 } 2156 2157 if (cf->output.burst_count > 1) { 2158 o += fprintf(stderr, "%d-%d ", cf->output.array_base, 2159 cf->output.array_base + cf->output.burst_count - 1); 2160 o += print_indent(o, 55); 2161 o += fprintf(stderr, "R%d-%d.", cf->output.gpr, 2162 cf->output.gpr + cf->output.burst_count - 1); 2163 } else { 2164 o += fprintf(stderr, "%d ", cf->output.array_base); 2165 o += print_indent(o, 55); 2166 o += fprintf(stderr, "R%d.", cf->output.gpr); 2167 } 2168 for (i = 0; i < 4; ++i) { 2169 if (cf->output.comp_mask & (1 << i)) 2170 o += print_swizzle(i); 2171 else 2172 o += print_swizzle(7); 2173 } 2174 2175 if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND || 2176 cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND) 2177 o += fprintf(stderr, " R%d", cf->output.index_gpr); 2178 2179 o += print_indent(o, 67); 2180 2181 fprintf(stderr, " ES:%i ", cf->output.elem_size); 2182 if (cf->output.array_size != 0xFFF) 2183 fprintf(stderr, "AS:%i ", cf->output.array_size); 2184 if (cf->mark) 2185 fprintf(stderr, "MARK "); 2186 if (!cf->barrier) 2187 fprintf(stderr, "NO_BARRIER "); 2188 if (cf->end_of_program) 2189 fprintf(stderr, "EOP "); 2190 fprintf(stderr, "\n"); 2191 } else { 2192 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2193 bc->bytecode[id + 1], cfop->name); 2194 fprintf(stderr, "@%d ", cf->cf_addr); 2195 if (cf->cond) 2196 fprintf(stderr, "CND:%X ", cf->cond); 2197 if (cf->pop_count) 2198 fprintf(stderr, "POP:%X ", cf->pop_count); 2199 if (cf->count && (cfop->flags & CF_EMIT)) 2200 fprintf(stderr, "STREAM%d ", cf->count); 2201 if (cf->end_of_program) 2202 fprintf(stderr, "EOP "); 2203 fprintf(stderr, "\n"); 2204 } 2205 } 2206 2207 id = cf->addr; 2208 nliteral = 0; 2209 
last = 1; 2210 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) { 2211 const char *omod_str[] = {"","*2","*4","/2"}; 2212 const struct alu_op_info *aop = r600_isa_alu(alu->op); 2213 int o = 0; 2214 2215 r600_bytecode_alu_nliterals(alu, literal, &nliteral); 2216 o += fprintf(stderr, " %04d %08X %08X ", id, bc->bytecode[id], bc->bytecode[id+1]); 2217 if (last) 2218 o += fprintf(stderr, "%4d ", ++ngr); 2219 else 2220 o += fprintf(stderr, " "); 2221 o += fprintf(stderr, "%c%c %c ", alu->execute_mask ? 'M':' ', 2222 alu->update_pred ? 'P':' ', 2223 alu->pred_sel ? alu->pred_sel==2 ? '0':'1':' '); 2224 2225 o += fprintf(stderr, "%s%s%s ", aop->name, 2226 omod_str[alu->omod], alu->dst.clamp ? "_sat":""); 2227 2228 o += print_indent(o,60); 2229 o += print_dst(alu); 2230 for (i = 0; i < aop->src_count; ++i) { 2231 o += fprintf(stderr, i == 0 ? ", ": ", "); 2232 o += print_src(alu, i); 2233 } 2234 2235 if (alu->bank_swizzle) { 2236 o += print_indent(o,75); 2237 o += fprintf(stderr, " BS:%d", alu->bank_swizzle); 2238 } 2239 2240 fprintf(stderr, "\n"); 2241 id += 2; 2242 2243 if (alu->last) { 2244 for (i = 0; i < nliteral; i++, id++) { 2245 float *f = (float*)(bc->bytecode + id); 2246 o = fprintf(stderr, " %04d %08X", id, bc->bytecode[id]); 2247 print_indent(o, 60); 2248 fprintf(stderr, " %f (%d)\n", *f, *(bc->bytecode + id)); 2249 } 2250 id += nliteral & 1; 2251 nliteral = 0; 2252 } 2253 last = alu->last; 2254 } 2255 2256 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) { 2257 int o = 0; 2258 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2259 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2260 2261 o += fprintf(stderr, "%s ", r600_isa_fetch(tex->op)->name); 2262 2263 o += print_indent(o, 50); 2264 2265 o += fprintf(stderr, "R%d.", tex->dst_gpr); 2266 o += print_swizzle(tex->dst_sel_x); 2267 o += print_swizzle(tex->dst_sel_y); 2268 o += print_swizzle(tex->dst_sel_z); 2269 o += print_swizzle(tex->dst_sel_w); 2270 2271 o += fprintf(stderr, ", R%d.", tex->src_gpr); 2272 o += 
print_swizzle(tex->src_sel_x); 2273 o += print_swizzle(tex->src_sel_y); 2274 o += print_swizzle(tex->src_sel_z); 2275 o += print_swizzle(tex->src_sel_w); 2276 2277 o += fprintf(stderr, ", RID:%d", tex->resource_id); 2278 o += fprintf(stderr, ", SID:%d ", tex->sampler_id); 2279 2280 if (tex->sampler_index_mode) 2281 fprintf(stderr, "SQ_%s ", index_mode[tex->sampler_index_mode]); 2282 2283 if (tex->lod_bias) 2284 fprintf(stderr, "LB:%d ", tex->lod_bias); 2285 2286 fprintf(stderr, "CT:%c%c%c%c ", 2287 tex->coord_type_x ? 'N' : 'U', 2288 tex->coord_type_y ? 'N' : 'U', 2289 tex->coord_type_z ? 'N' : 'U', 2290 tex->coord_type_w ? 'N' : 'U'); 2291 2292 if (tex->offset_x) 2293 fprintf(stderr, "OX:%d ", tex->offset_x); 2294 if (tex->offset_y) 2295 fprintf(stderr, "OY:%d ", tex->offset_y); 2296 if (tex->offset_z) 2297 fprintf(stderr, "OZ:%d ", tex->offset_z); 2298 2299 id += 4; 2300 fprintf(stderr, "\n"); 2301 } 2302 2303 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 2304 int o = 0; 2305 const char * fetch_type[] = {"VERTEX", "INSTANCE", ""}; 2306 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2307 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2308 2309 o += fprintf(stderr, "%s ", r600_isa_fetch(vtx->op)->name); 2310 2311 o += print_indent(o, 50); 2312 2313 o += fprintf(stderr, "R%d.", vtx->dst_gpr); 2314 o += print_swizzle(vtx->dst_sel_x); 2315 o += print_swizzle(vtx->dst_sel_y); 2316 o += print_swizzle(vtx->dst_sel_z); 2317 o += print_swizzle(vtx->dst_sel_w); 2318 2319 o += fprintf(stderr, ", R%d.", vtx->src_gpr); 2320 o += print_swizzle(vtx->src_sel_x); 2321 2322 if (vtx->offset) 2323 fprintf(stderr, " +%db", vtx->offset); 2324 2325 o += print_indent(o, 55); 2326 2327 fprintf(stderr, ", RID:%d ", vtx->buffer_id); 2328 2329 fprintf(stderr, "%s ", fetch_type[vtx->fetch_type]); 2330 2331 if (bc->chip_class < CAYMAN && vtx->mega_fetch_count) 2332 fprintf(stderr, "MFC:%d ", vtx->mega_fetch_count); 2333 2334 if (bc->chip_class >= EVERGREEN && 
vtx->buffer_index_mode) 2335 fprintf(stderr, "SQ_%s ", index_mode[vtx->buffer_index_mode]); 2336 2337 fprintf(stderr, "UCF:%d ", vtx->use_const_fields); 2338 fprintf(stderr, "FMT(DTA:%d ", vtx->data_format); 2339 fprintf(stderr, "NUM:%d ", vtx->num_format_all); 2340 fprintf(stderr, "COMP:%d ", vtx->format_comp_all); 2341 fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all); 2342 2343 id += 4; 2344 } 2345 2346 LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) { 2347 int o = 0; 2348 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2349 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2350 2351 o += fprintf(stderr, "%s ", r600_isa_fetch(gds->op)->name); 2352 2353 if (gds->op != FETCH_OP_TF_WRITE) { 2354 o += fprintf(stderr, "R%d.", gds->dst_gpr); 2355 o += print_swizzle(gds->dst_sel_x); 2356 o += print_swizzle(gds->dst_sel_y); 2357 o += print_swizzle(gds->dst_sel_z); 2358 o += print_swizzle(gds->dst_sel_w); 2359 } 2360 2361 o += fprintf(stderr, ", R%d.", gds->src_gpr); 2362 o += print_swizzle(gds->src_sel_x); 2363 o += print_swizzle(gds->src_sel_y); 2364 o += print_swizzle(gds->src_sel_z); 2365 2366 if (gds->op != FETCH_OP_TF_WRITE) { 2367 o += fprintf(stderr, ", R%d.", gds->src_gpr2); 2368 } 2369 if (gds->alloc_consume) { 2370 o += fprintf(stderr, " UAV: %d", gds->uav_id); 2371 if (gds->uav_index_mode) 2372 o += fprintf(stderr, "[%s]", index_mode[gds->uav_index_mode]); 2373 } 2374 fprintf(stderr, "\n"); 2375 id += 4; 2376 } 2377 } 2378 2379 fprintf(stderr, "--------------------------------------\n"); 2380 } 2381 2382 void r600_vertex_data_type(enum pipe_format pformat, 2383 unsigned *format, 2384 unsigned *num_format, unsigned *format_comp, unsigned *endian) 2385 { 2386 const struct util_format_description *desc; 2387 unsigned i; 2388 2389 *format = 0; 2390 *num_format = 0; 2391 *format_comp = 0; 2392 *endian = ENDIAN_NONE; 2393 2394 if (pformat == PIPE_FORMAT_R11G11B10_FLOAT) { 2395 *format = FMT_10_11_11_FLOAT; 2396 *endian = r600_endian_swap(32); 2397 return; 
2398 } 2399 2400 if (pformat == PIPE_FORMAT_B5G6R5_UNORM) { 2401 *format = FMT_5_6_5; 2402 *endian = r600_endian_swap(16); 2403 return; 2404 } 2405 2406 if (pformat == PIPE_FORMAT_B5G5R5A1_UNORM) { 2407 *format = FMT_1_5_5_5; 2408 *endian = r600_endian_swap(16); 2409 return; 2410 } 2411 2412 desc = util_format_description(pformat); 2413 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { 2414 goto out_unknown; 2415 } 2416 2417 /* Find the first non-VOID channel. */ 2418 for (i = 0; i < 4; i++) { 2419 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { 2420 break; 2421 } 2422 } 2423 2424 *endian = r600_endian_swap(desc->channel[i].size); 2425 2426 switch (desc->channel[i].type) { 2427 /* Half-floats, floats, ints */ 2428 case UTIL_FORMAT_TYPE_FLOAT: 2429 switch (desc->channel[i].size) { 2430 case 16: 2431 switch (desc->nr_channels) { 2432 case 1: 2433 *format = FMT_16_FLOAT; 2434 break; 2435 case 2: 2436 *format = FMT_16_16_FLOAT; 2437 break; 2438 case 3: 2439 case 4: 2440 *format = FMT_16_16_16_16_FLOAT; 2441 break; 2442 } 2443 break; 2444 case 32: 2445 switch (desc->nr_channels) { 2446 case 1: 2447 *format = FMT_32_FLOAT; 2448 break; 2449 case 2: 2450 *format = FMT_32_32_FLOAT; 2451 break; 2452 case 3: 2453 *format = FMT_32_32_32_FLOAT; 2454 break; 2455 case 4: 2456 *format = FMT_32_32_32_32_FLOAT; 2457 break; 2458 } 2459 break; 2460 default: 2461 goto out_unknown; 2462 } 2463 break; 2464 /* Unsigned ints */ 2465 case UTIL_FORMAT_TYPE_UNSIGNED: 2466 /* Signed ints */ 2467 case UTIL_FORMAT_TYPE_SIGNED: 2468 switch (desc->channel[i].size) { 2469 case 8: 2470 switch (desc->nr_channels) { 2471 case 1: 2472 *format = FMT_8; 2473 break; 2474 case 2: 2475 *format = FMT_8_8; 2476 break; 2477 case 3: 2478 case 4: 2479 *format = FMT_8_8_8_8; 2480 break; 2481 } 2482 break; 2483 case 10: 2484 if (desc->nr_channels != 4) 2485 goto out_unknown; 2486 2487 *format = FMT_2_10_10_10; 2488 break; 2489 case 16: 2490 switch (desc->nr_channels) { 2491 case 1: 2492 *format = FMT_16; 2493 
break; 2494 case 2: 2495 *format = FMT_16_16; 2496 break; 2497 case 3: 2498 case 4: 2499 *format = FMT_16_16_16_16; 2500 break; 2501 } 2502 break; 2503 case 32: 2504 switch (desc->nr_channels) { 2505 case 1: 2506 *format = FMT_32; 2507 break; 2508 case 2: 2509 *format = FMT_32_32; 2510 break; 2511 case 3: 2512 *format = FMT_32_32_32; 2513 break; 2514 case 4: 2515 *format = FMT_32_32_32_32; 2516 break; 2517 } 2518 break; 2519 default: 2520 goto out_unknown; 2521 } 2522 break; 2523 default: 2524 goto out_unknown; 2525 } 2526 2527 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { 2528 *format_comp = 1; 2529 } 2530 2531 *num_format = 0; 2532 if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED || 2533 desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { 2534 if (!desc->channel[i].normalized) { 2535 if (desc->channel[i].pure_integer) 2536 *num_format = 1; 2537 else 2538 *num_format = 2; 2539 } 2540 } 2541 return; 2542 out_unknown: 2543 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat)); 2544 } 2545 2546 void *r600_create_vertex_fetch_shader(struct pipe_context *ctx, 2547 unsigned count, 2548 const struct pipe_vertex_element *elements) 2549 { 2550 struct r600_context *rctx = (struct r600_context *)ctx; 2551 struct r600_bytecode bc; 2552 struct r600_bytecode_vtx vtx; 2553 const struct util_format_description *desc; 2554 unsigned fetch_resource_start = rctx->b.chip_class >= EVERGREEN ? 
0 : 160; 2555 unsigned format, num_format, format_comp, endian; 2556 uint32_t *bytecode; 2557 int i, j, r, fs_size; 2558 struct r600_fetch_shader *shader; 2559 unsigned no_sb = rctx->screen->b.debug_flags & DBG_NO_SB; 2560 unsigned sb_disasm = !no_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); 2561 2562 assert(count < 32); 2563 2564 memset(&bc, 0, sizeof(bc)); 2565 r600_bytecode_init(&bc, rctx->b.chip_class, rctx->b.family, 2566 rctx->screen->has_compressed_msaa_texturing); 2567 2568 bc.isa = rctx->isa; 2569 2570 for (i = 0; i < count; i++) { 2571 if (elements[i].instance_divisor > 1) { 2572 if (rctx->b.chip_class == CAYMAN) { 2573 for (j = 0; j < 4; j++) { 2574 struct r600_bytecode_alu alu; 2575 memset(&alu, 0, sizeof(alu)); 2576 alu.op = ALU_OP2_MULHI_UINT; 2577 alu.src[0].sel = 0; 2578 alu.src[0].chan = 3; 2579 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2580 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1; 2581 alu.dst.sel = i + 1; 2582 alu.dst.chan = j; 2583 alu.dst.write = j == 3; 2584 alu.last = j == 3; 2585 if ((r = r600_bytecode_add_alu(&bc, &alu))) { 2586 r600_bytecode_clear(&bc); 2587 return NULL; 2588 } 2589 } 2590 } else { 2591 struct r600_bytecode_alu alu; 2592 memset(&alu, 0, sizeof(alu)); 2593 alu.op = ALU_OP2_MULHI_UINT; 2594 alu.src[0].sel = 0; 2595 alu.src[0].chan = 3; 2596 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; 2597 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1; 2598 alu.dst.sel = i + 1; 2599 alu.dst.chan = 3; 2600 alu.dst.write = 1; 2601 alu.last = 1; 2602 if ((r = r600_bytecode_add_alu(&bc, &alu))) { 2603 r600_bytecode_clear(&bc); 2604 return NULL; 2605 } 2606 } 2607 } 2608 } 2609 2610 for (i = 0; i < count; i++) { 2611 r600_vertex_data_type(elements[i].src_format, 2612 &format, &num_format, &format_comp, &endian); 2613 2614 desc = util_format_description(elements[i].src_format); 2615 if (!desc) { 2616 r600_bytecode_clear(&bc); 2617 R600_ERR("unknown format %d\n", elements[i].src_format); 2618 return 
NULL; 2619 } 2620 2621 if (elements[i].src_offset > 65535) { 2622 r600_bytecode_clear(&bc); 2623 R600_ERR("too big src_offset: %u\n", elements[i].src_offset); 2624 return NULL; 2625 } 2626 2627 memset(&vtx, 0, sizeof(vtx)); 2628 vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start; 2629 vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA; 2630 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0; 2631 vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0; 2632 vtx.mega_fetch_count = 0x1F; 2633 vtx.dst_gpr = i + 1; 2634 vtx.dst_sel_x = desc->swizzle[0]; 2635 vtx.dst_sel_y = desc->swizzle[1]; 2636 vtx.dst_sel_z = desc->swizzle[2]; 2637 vtx.dst_sel_w = desc->swizzle[3]; 2638 vtx.data_format = format; 2639 vtx.num_format_all = num_format; 2640 vtx.format_comp_all = format_comp; 2641 vtx.offset = elements[i].src_offset; 2642 vtx.endian = endian; 2643 2644 if ((r = r600_bytecode_add_vtx(&bc, &vtx))) { 2645 r600_bytecode_clear(&bc); 2646 return NULL; 2647 } 2648 } 2649 2650 r600_bytecode_add_cfinst(&bc, CF_OP_RET); 2651 2652 if ((r = r600_bytecode_build(&bc))) { 2653 r600_bytecode_clear(&bc); 2654 return NULL; 2655 } 2656 2657 if (rctx->screen->b.debug_flags & DBG_FS) { 2658 fprintf(stderr, "--------------------------------------------------------------\n"); 2659 fprintf(stderr, "Vertex elements state:\n"); 2660 for (i = 0; i < count; i++) { 2661 fprintf(stderr, " "); 2662 util_dump_vertex_element(stderr, elements+i); 2663 fprintf(stderr, "\n"); 2664 } 2665 2666 if (!sb_disasm) { 2667 r600_bytecode_disasm(&bc); 2668 2669 fprintf(stderr, "______________________________________________________________\n"); 2670 } else { 2671 r600_sb_bytecode_process(rctx, &bc, NULL, 1 /*dump*/, 0 /*optimize*/); 2672 } 2673 } 2674 2675 fs_size = bc.ndw*4; 2676 2677 /* Allocate the CSO. 
*/ 2678 shader = CALLOC_STRUCT(r600_fetch_shader); 2679 if (!shader) { 2680 r600_bytecode_clear(&bc); 2681 return NULL; 2682 } 2683 2684 u_suballocator_alloc(rctx->allocator_fetch_shader, fs_size, 256, 2685 &shader->offset, 2686 (struct pipe_resource**)&shader->buffer); 2687 if (!shader->buffer) { 2688 r600_bytecode_clear(&bc); 2689 FREE(shader); 2690 return NULL; 2691 } 2692 2693 bytecode = r600_buffer_map_sync_with_rings(&rctx->b, shader->buffer, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED); 2694 bytecode += shader->offset / 4; 2695 2696 if (R600_BIG_ENDIAN) { 2697 for (i = 0; i < fs_size / 4; ++i) { 2698 bytecode[i] = util_cpu_to_le32(bc.bytecode[i]); 2699 } 2700 } else { 2701 memcpy(bytecode, bc.bytecode, fs_size); 2702 } 2703 rctx->b.ws->buffer_unmap(shader->buffer->buf); 2704 2705 r600_bytecode_clear(&bc); 2706 return shader; 2707 } 2708 2709 void r600_bytecode_alu_read(struct r600_bytecode *bc, 2710 struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1) 2711 { 2712 /* WORD0 */ 2713 alu->src[0].sel = G_SQ_ALU_WORD0_SRC0_SEL(word0); 2714 alu->src[0].rel = G_SQ_ALU_WORD0_SRC0_REL(word0); 2715 alu->src[0].chan = G_SQ_ALU_WORD0_SRC0_CHAN(word0); 2716 alu->src[0].neg = G_SQ_ALU_WORD0_SRC0_NEG(word0); 2717 alu->src[1].sel = G_SQ_ALU_WORD0_SRC1_SEL(word0); 2718 alu->src[1].rel = G_SQ_ALU_WORD0_SRC1_REL(word0); 2719 alu->src[1].chan = G_SQ_ALU_WORD0_SRC1_CHAN(word0); 2720 alu->src[1].neg = G_SQ_ALU_WORD0_SRC1_NEG(word0); 2721 alu->index_mode = G_SQ_ALU_WORD0_INDEX_MODE(word0); 2722 alu->pred_sel = G_SQ_ALU_WORD0_PRED_SEL(word0); 2723 alu->last = G_SQ_ALU_WORD0_LAST(word0); 2724 2725 /* WORD1 */ 2726 alu->bank_swizzle = G_SQ_ALU_WORD1_BANK_SWIZZLE(word1); 2727 if (alu->bank_swizzle) 2728 alu->bank_swizzle_force = alu->bank_swizzle; 2729 alu->dst.sel = G_SQ_ALU_WORD1_DST_GPR(word1); 2730 alu->dst.rel = G_SQ_ALU_WORD1_DST_REL(word1); 2731 alu->dst.chan = G_SQ_ALU_WORD1_DST_CHAN(word1); 2732 alu->dst.clamp = G_SQ_ALU_WORD1_CLAMP(word1); 2733 if 
(G_SQ_ALU_WORD1_ENCODING(word1)) /*ALU_DWORD1_OP3*/ 2734 { 2735 alu->is_op3 = 1; 2736 alu->src[2].sel = G_SQ_ALU_WORD1_OP3_SRC2_SEL(word1); 2737 alu->src[2].rel = G_SQ_ALU_WORD1_OP3_SRC2_REL(word1); 2738 alu->src[2].chan = G_SQ_ALU_WORD1_OP3_SRC2_CHAN(word1); 2739 alu->src[2].neg = G_SQ_ALU_WORD1_OP3_SRC2_NEG(word1); 2740 alu->op = r600_isa_alu_by_opcode(bc->isa, 2741 G_SQ_ALU_WORD1_OP3_ALU_INST(word1), /* is_op3 = */ 1); 2742 2743 } 2744 else /*ALU_DWORD1_OP2*/ 2745 { 2746 alu->src[0].abs = G_SQ_ALU_WORD1_OP2_SRC0_ABS(word1); 2747 alu->src[1].abs = G_SQ_ALU_WORD1_OP2_SRC1_ABS(word1); 2748 alu->op = r600_isa_alu_by_opcode(bc->isa, 2749 G_SQ_ALU_WORD1_OP2_ALU_INST(word1), /* is_op3 = */ 0); 2750 alu->omod = G_SQ_ALU_WORD1_OP2_OMOD(word1); 2751 alu->dst.write = G_SQ_ALU_WORD1_OP2_WRITE_MASK(word1); 2752 alu->update_pred = G_SQ_ALU_WORD1_OP2_UPDATE_PRED(word1); 2753 alu->execute_mask = 2754 G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(word1); 2755 } 2756 } 2757 2758 #if 0 2759 void r600_bytecode_export_read(struct r600_bytecode *bc, 2760 struct r600_bytecode_output *output, uint32_t word0, uint32_t word1) 2761 { 2762 output->array_base = G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(word0); 2763 output->type = G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(word0); 2764 output->gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(word0); 2765 output->elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(word0); 2766 2767 output->swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1); 2768 output->swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1); 2769 output->swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1); 2770 output->swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1); 2771 output->burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1); 2772 output->end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1); 2773 output->op = r600_isa_cf_by_opcode(bc->isa, 2774 G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), 0); 2775 output->barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1); 2776 
output->array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1); 2777 output->comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1); 2778 } 2779 #endif 2780